All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
* Re: 2.2.0 Bug summary
       [not found] <199812290146.BAA12687@terrorserver.swansea.linux.org.uk>
@ 1998-12-31 18:00 ` Andrea Arcangeli
  1998-12-31 18:34   ` [patch] new-vm improvement [Re: 2.2.0 Bug summary] Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1998-12-31 18:00 UTC (permalink / raw
  To: Alan Cox
  Cc: linux-kernel, Linus Torvalds, Stephen C. Tweedie,
	Benjamin Redelings I, Rik van Riel, linux-mm

On Tue, 29 Dec 1998, Alan Cox wrote:

> o	Linus VM is still 20% slower than sct vm on an 8Mb machine
> 	[benchmarks kernel build and netscape]

Today I start playing with Linus's vm in 2.2.0-pre1 and I changed the
semantics of many things and I added heuristic to avoid that one process
trashing memory will hang other "normal" processes. This my new VM I
developed today is _far_ better than sct's ac11 vm and anything I tried
before. I would like if somebody could try it also on low memory machines
and feedback what happens there.  I don't have enough spare time to test
it on many kind of hardware too. 

The same benchmark that was taking 106 sec on clean 2.2.0-pre1 to
dirtifying 160Mbyte of virtual memory (run with 128RAM and 72swap of phis
mem), now runs in 90 sec but this is not the most important thing, the
good point is that the cache/buffer/swap levels now are perfectly stable
and all other processes runs fine and get not out of cache even if there's
a memory trasher running at the same time.

Comments?

Ah, the shrink_mmap limit was wrong since we account only not referenced
pages.

Patch against 2.2.0-pre1:

Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.7 linux/mm/filemap.c:1.1.1.1.2.29
--- linux/mm/filemap.c:1.1.1.7	Wed Dec 23 15:25:21 1998
+++ linux/mm/filemap.c	Thu Dec 31 17:56:27 1998
@@ -125,7 +129,7 @@
 	struct page * page;
 	int count;
 
-	count = (limit<<1) >> (priority);
+	count = limit >> priority;
 
 	page = mem_map + clock;
 	do {
@@ -182,6 +186,7 @@
 	return 0;
 }
 
+#if 0
 /*
  * This is called from try_to_swap_out() when we try to get rid of some
  * pages..  If we're unmapping the last occurrence of this page, we also
@@ -201,6 +206,7 @@
 	remove_inode_page(page);
 	return 1;
 }
+#endif
 
 /*
  * Update a page cache copy, when we're doing a "write()" system call
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.3 linux/mm/page_alloc.c:1.1.1.1.2.11
--- linux/mm/page_alloc.c:1.1.1.3	Sun Dec 20 16:31:11 1998
+++ linux/mm/page_alloc.c	Thu Dec 31 17:56:27 1998
@@ -241,7 +241,29 @@
 			goto nopage;
 		}
 
-		if (freepages.min > nr_free_pages) {
+		if (freepages.high < nr_free_pages)
+		{
+			if (current->trashing_memory)
+			{
+				current->trashing_memory = 0;
+#if 0
+				printk("trashing end for %s\n", current->comm);
+#endif
+			}
+		} else if (freepages.min > nr_free_pages) {
+			if (!current->trashing_memory)
+			{
+				current->trashing_memory = 1;
+#if 0
+				printk("trashing start for %s\n", current->comm);
+#endif
+			}
+		}
+
+		/*
+		 * Block the process that is trashing memory. -arca
+		 */
+		if (current->trashing_memory) {
 			int freed;
 			freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
 			/*
Index: linux/mm/swap_state.c
diff -u linux/mm/swap_state.c:1.1.1.3 linux/mm/swap_state.c:1.1.1.1.2.8
--- linux/mm/swap_state.c:1.1.1.3	Sun Dec 20 16:31:12 1998
+++ linux/mm/swap_state.c	Tue Dec 22 18:42:03 1998
@@ -248,7 +248,7 @@
 		delete_from_swap_cache(page);
 	}
 	
-	free_page(addr);
+	__free_page(page);
 }
 
 
@@ -261,6 +261,9 @@
 struct page * lookup_swap_cache(unsigned long entry)
 {
 	struct page *found;
+#ifdef	SWAP_CACHE_INFO
+	swap_cache_find_total++;
+#endif
 	
 	while (1) {
 		found = find_page(&swapper_inode, entry);
@@ -268,8 +271,12 @@
 			return 0;
 		if (found->inode != &swapper_inode || !PageSwapCache(found))
 			goto out_bad;
-		if (!PageLocked(found))
+		if (!PageLocked(found)) {
+#ifdef	SWAP_CACHE_INFO
+			swap_cache_find_success++;
+#endif
 			return found;
+		}
 		__free_page(found);
 		__wait_on_page(found);
 	}
Index: linux/mm/vmalloc.c
diff -u linux/mm/vmalloc.c:1.1.1.2 linux/mm/vmalloc.c:1.1.1.1.2.2
--- linux/mm/vmalloc.c:1.1.1.2	Fri Nov 27 11:19:11 1998
+++ linux/mm/vmalloc.c	Fri Nov 27 11:41:42 1998
@@ -185,7 +185,8 @@
 	for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
 		if (tmp->addr == addr) {
 			*p = tmp->next;
-			vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
+			vmfree_area_pages(VMALLOC_VMADDR(tmp->addr),
+					  tmp->size - PAGE_SIZE);
 			kfree(tmp);
 			return;
 		}
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.6 linux/mm/vmscan.c:1.1.1.1.2.43
--- linux/mm/vmscan.c:1.1.1.6	Tue Dec 22 11:56:28 1998
+++ linux/mm/vmscan.c	Thu Dec 31 17:56:27 1998
@@ -162,8 +162,8 @@
 			 * copy in memory, so we add it to the swap
 			 * cache. */
 			if (PageSwapCache(page_map)) {
-				free_page(page);
-				return (atomic_read(&page_map->count) == 0);
+				__free_page(page_map);
+				return atomic_read(&page_map->count) + 1;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -180,8 +180,8 @@
 		 * asynchronously.  That's no problem, shrink_mmap() can
 		 * correctly clean up the occassional unshared page
 		 * which gets left behind in the swap cache. */
-		free_page(page);
-		return 1;	/* we slept: the process may not exist any more */
+		__free_page(page_map);
+		return atomic_read(&page_map->count) + 1;	/* we slept: the process may not exist any more */
 	}
 
 	/* The page was _not_ dirty, but still has a zero age.  It must
@@ -194,8 +194,8 @@
 		set_pte(page_table, __pte(entry));
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
-		free_page(page);
-		return (atomic_read(&page_map->count) == 0);
+		__free_page(page_map);
+		return atomic_read(&page_map->count) + 1;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,9 +210,8 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
 	__free_page(page_map);
-	return entry;
+	return atomic_read(&page_map->count) + 1;
 }
 
 /*
@@ -369,8 +368,14 @@
 	 * swapped out.  If the swap-out fails, we clear swap_cnt so the 
 	 * task won't be selected again until all others have been tried.
 	 */
-	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
+	counter = nr_tasks / (priority+1);
+	if (counter < 1)
+		counter = 1;
+	if (counter > nr_tasks)
+		counter = nr_tasks;
+
 	for (; counter >= 0; counter--) {
+		int retval;
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
@@ -382,15 +387,8 @@
 				continue;
 	 		if (p->mm->rss <= 0)
 				continue;
-			if (assign) {
-				/* 
-				 * If we didn't select a task on pass 1, 
-				 * assign each task a new swap_cnt.
-				 * Normalise the number of pages swapped
-				 * by multiplying by (RSS / 1MB)
-				 */
-				p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
-			}
+			if (assign)
+				p->swap_cnt = p->mm->rss;
 			if (p->swap_cnt > max_cnt) {
 				max_cnt = p->swap_cnt;
 				pbest = p;
@@ -404,14 +402,13 @@
 			}
 			goto out;
 		}
-		pbest->swap_cnt--;
-
 		/*
 		 * Nonzero means we cleared out something, but only "1" means
 		 * that we actually free'd up a page as a result.
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
-				return 1;
+		retval = swap_out_process(pbest, gfp_mask);
+		if (retval)
+			return retval;
 	}
 out:
 	return 0;
@@ -438,44 +435,78 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	switch (swap_out(priority, gfp_mask))
+	{
+	default:
+		shrink_mmap(0, gfp_mask);
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
+	case 0:
+		/* swap_out() failed to swapout */
+		if (shrink_mmap(priority, gfp_mask))
+		{
+			printk("swapout 0 shrink 1\n");
+			return 1;
+		}
+		printk("swapout 0 shrink 0\n");
+		return 0;
+	case 1:
+		/* this would be the best but should not happen right now */
+		printk(KERN_DEBUG
+		       "do_free_user_and_cache: swapout returned 1\n");
+		return 1;
+	}
+}
 
-static int kswapd_free_pages(int kswapd_state)
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 6;
+
+	kmem_cache_reap(gfp_mask);
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	switch (*state) {
+		do {
+		default:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 2;
+		case 2:
+			shrink_dcache_memory(priority, gfp_mask);
+			*state = 0;
+		} while (--priority >= 0);
+	}
+	return 0;
+}
 
+static int kswapd_free_pages(int kswapd_state)
+{
 	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 5;
-		int count = pager_daemon.swap_cluster;
+	unsigned long end_time = jiffies + (HZ-1)/100;
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				kswapd_state++;
-			case 2:
-				free_memory(swap_out(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+	do {
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
 			break;
 	} while (time_before_eq(jiffies,end_time));
+	/* take kswapd_state on the stack to save some byte of memory */
 	return kswapd_state;
 }
 
+static inline void enable_swap_tick(void)
+{
+	timer_table[SWAP_TIMER].expires = jiffies+(HZ+99)/100;
+	timer_active |= 1<<SWAP_TIMER;
+}
+
 /*
  * The background pageout daemon.
  * Started as a kernel thread from the init process.
@@ -523,6 +554,7 @@
 		current->state = TASK_INTERRUPTIBLE;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
+		enable_swap_tick();
 		schedule();
 		swapstats.wakeups++;
 		state = kswapd_free_pages(state);
@@ -542,35 +574,24 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
+		static int state = 0;
 
 		current->flags |= PF_MEMALLOC;
 	
-		priority = 5;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		while (count--)
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
@@ -593,7 +614,8 @@
 	if (priority) {
 		p->counter = p->priority << priority;
 		wake_up_process(p);
-	}
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -631,9 +653,8 @@
 			want_wakeup = 3;
 	
 		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
+	} else
+		enable_swap_tick();
 }
 
 /* 
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.3 linux/kernel/fork.c:1.1.1.1.2.6
--- linux/kernel/fork.c:1.1.1.3	Thu Dec  3 12:55:12 1998
+++ linux/kernel/fork.c	Thu Dec 31 17:56:28 1998
@@ -567,6 +570,7 @@
 
 	/* ok, now we should be set up.. */
 	p->swappable = 1;
+	p->trashing_memory = 0;
 	p->exit_signal = clone_flags & CSIGNAL;
 	p->pdeath_signal = 0;
 
Index: linux/include/linux/sched.h
diff -u linux/include/linux/sched.h:1.1.1.2 linux/include/linux/sched.h:1.1.1.1.2.7
--- linux/include/linux/sched.h:1.1.1.2	Tue Dec 29 01:39:00 1998
+++ linux/include/linux/sched.h	Thu Dec 31 17:56:29 1998
@@ -268,6 +273,7 @@
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
+	int trashing_memory:1;
 	unsigned long swap_address;
 	unsigned long old_maj_flt;	/* old value of maj_flt */
 	unsigned long dec_flt;		/* page fault count of the last time */
@@ -353,7 +359,7 @@
 /* utime */	{0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */	0,0,0,0,0,0, \
-/* swp */	0,0,0,0,0, \
+/* swp */	0,0,0,0,0,0, \
 /* process credentials */					\
 /* uid etc */	0,0,0,0,0,0,0,0,				\
 /* suppl grps*/ 0, {0,},					\





--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1998-12-31 18:00 ` 2.2.0 Bug summary Andrea Arcangeli
@ 1998-12-31 18:34   ` Andrea Arcangeli
  1999-01-01  0:16     ` Steve Bergman
  1999-01-01 16:44     ` Andrea Arcangeli
  0 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1998-12-31 18:34 UTC (permalink / raw
  To: Alan Cox
  Cc: linux-kernel, Linus Torvalds, Stephen C. Tweedie,
	Benjamin Redelings I, Rik van Riel, linux-mm

On Thu, 31 Dec 1998, Andrea Arcangeli wrote:

> Comments?
> 
> Ah, the shrink_mmap limit was wrong since we account only not referenced
> pages.
> 
> Patch against 2.2.0-pre1:

whoops in the last email I forget to change a bit the subject (adding
[patch]) and this printk: 

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.1.2.43 linux/mm/vmscan.c:1.1.1.1.2.45
--- linux/mm/vmscan.c:1.1.1.1.2.43	Thu Dec 31 17:56:27 1998
+++ linux/mm/vmscan.c	Thu Dec 31 19:41:06 1998
@@ -449,11 +449,7 @@
 	case 0:
 		/* swap_out() failed to swapout */
 		if (shrink_mmap(priority, gfp_mask))
-		{
-			printk("swapout 0 shrink 1\n");
 			return 1;
-		}
-		printk("swapout 0 shrink 0\n");
 		return 0;
 	case 1:
 		/* this would be the best but should not happen right now */



Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1998-12-31 18:34   ` [patch] new-vm improvement [Re: 2.2.0 Bug summary] Andrea Arcangeli
@ 1999-01-01  0:16     ` Steve Bergman
  1999-01-01 17:16       ` Andrea Arcangeli
  1999-01-01 16:44     ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Steve Bergman @ 1999-01-01  0:16 UTC (permalink / raw
  To: Andrea Arcangeli; +Cc: linux-mm

Andrea Arcangeli wrote:
> 
> On Thu, 31 Dec 1998, Andrea Arcangeli wrote:
> 
> > Comments?
> >
> > Ah, the shrink_mmap limit was wrong since we account only not referenced
> > pages.
> >
> > Patch against 2.2.0-pre1:
> 
> whoops in the last email I forget to change a bit the subject (adding
> [patch]) and this printk:

Hi,

I just tried out the patch and got very disappointing results on my
128MB AMD K6-3.  I tested by loading 117 good sized images all at once. 
This kicks it ~ 165MB into the swap (~ 293 MB mem total).  The standard
2.2.0-pre1 kernel streamed out to swap at an average of >1MB/sec and
finished in 184 seconds.  WIth the patched kernel I stopped at 280 sec. 
At that time it had about 65 mb swapped out or < 250K/sec.  I then
rebooted, brought up X and an xterm and went to compile the 2.1.131-ac11
patch (still running under the patched 2.2.0-pre1) and noted that during
the compile I had 17MB in the swap with nothing else going on.  Bringing
up netscape put it up to 25MB.   Suggestions? Requests?  Let me know if
you want me to try anything else.

Thanks,
Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1998-12-31 18:34   ` [patch] new-vm improvement [Re: 2.2.0 Bug summary] Andrea Arcangeli
  1999-01-01  0:16     ` Steve Bergman
@ 1999-01-01 16:44     ` Andrea Arcangeli
  1999-01-01 20:02       ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-01 16:44 UTC (permalink / raw
  To: Benjamin Redelings I, Stephen C. Tweedie, Linus Torvalds
  Cc: linux-kernel, Alan Cox, Rik van Riel, linux-mm

I' ll try to comment my latest VM patch.

The patch basically do two things.

It add an heuristic to block trashing tasks in try_to_free_pages() and
allow normal tasks to run fine in the meantime.

It returns to the old do_try_to_free_pages() way to do things. I think the
reason the old way was no longer working well is that we are using
swap_out()  as other freeing-methods while swapout has really nothing to
do with them. 

To get VM stability under low memory we must use both swap_out() (that put
pages from the user process Vmemory to the swap cache) and shrink_mmap() 
in a new method. My new method put user pages in the swap cache because
there we can handle aging very well. Then shrink_mmap() can free a not
referenced page to really do some progress in the memory freeing (and not
only in the swapout).

So basically my patch cause sure the system to swapout more than we was
used to do, but most of the time we will not need a swapin to reput the
pages in the process Vmemory.

Somebody reported a big slowdown of the trashing application. Right now I
don't know which bit of the patch caused this slowdown (yesterday my
benchmark here didn't showed this slowdown). My new trashing_memory
heuristic will probably decrease performance for the trashing application
(but hey you know that if you need performance you can alwaws buy more RAM
;), but it will improve a lot performance for normal not-trashing tasks. 

I' ll try to change do_free_user_and_cache() to see if I can achieve
something better.

I changed also the swap_out() since the best way to choose a process is to
compare the raw RSS I think. And I don' t want that swap_cnt is decreased
of something every time something is swapped out. I want that the kernel
will continue passing throught all the pages of one process once it
started playing with it (if it will still exists of course ;). I changed
also the pressure of swap_out() since it make no sense to me to pass more
than one time over the VM of all tasks in the system. Now at priority 6
swap_out()  is trying to swapout something at max from nr_tasks/7 (low
bound to 1 task). I changed also the pressure of shrink_mmap() because it
was making no sense to me to do two passes on just not referenced pages.

I also changed swapout() allowing it to return 0 1 or more.

0 means that swap_out() is been not able to put in the swap cache
something.

1 means that swap_out() is been able to swapout something and has also
freed up one page (how??? it can't right now because the page should
always be still at least present in the swap cache)

2 means that swap_out() has swapped out 1 page and that the page is still
referenced somewhere (probably by the swap cache)

So in case 2 and case 0 we must use shrink_mmap() to really do some
progress in the page freeing.  This the idea that my new
do_free_user_and_cache() follows.

Comments?

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-01  0:16     ` Steve Bergman
@ 1999-01-01 17:16       ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-01 17:16 UTC (permalink / raw
  To: Steve Bergman; +Cc: linux-mm

On Thu, 31 Dec 1998, Steve Bergman wrote:

> I just tried out the patch and got very disappointing results on my
> 128MB AMD K6-3.  I tested by loading 117 good sized images all at once. 

The point of my patch is to balance the VM and improve performance for not
memory trashing proggy. It make sense that the trashing program is been
slowed down... Once the proggy will stop allocating RAM but it will
continue to use only pages just allocated (eventually in swap) performance
should return normal.

> patch (still running under the patched 2.2.0-pre1) and noted that during
> the compile I had 17MB in the swap with nothing else going on.  Bringing
> up netscape put it up to 25MB.   Suggestions? Requests?  Let me know if

I am going to still change something for sure. But please don't care the
size of the SWAP, care only performances. The pages in the swap right now
are likely to be present also in the swap cache so you' ll handle both
aging and a little cost in a swapin using more the swap cache. Really
there's also the cost of an async swapout to disk but it seems to not harm
here.

> you want me to try anything else.

Yes you should tell me if the performances decreased with normal usage
(like netscape + kernel compile). 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-01 16:44     ` Andrea Arcangeli
@ 1999-01-01 20:02       ` Andrea Arcangeli
  1999-01-01 23:46         ` Steve Bergman
  1999-01-02  3:03         ` Andrea Arcangeli
  0 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-01 20:02 UTC (permalink / raw
  To: Benjamin Redelings I, Stephen C. Tweedie, Linus Torvalds
  Cc: linux-kernel, Alan Cox, Rik van Riel, linux-mm

I rediffed my VM patch against test1-patch-2.2.0-pre3.gz. I also fixed
some bug (not totally critical but..) pointed out by Linus in my last
code. I also changed the shrink_mmap(0) to shrink_mmap(priority) because
it was completly sucking a lot performance. There is no need to do a
shrink_mmap(0) for example if the cache/buffer are under min. In such case
we must allow the swap_out() to grow the cache before start shrinking it.

So basically this new patch is _far_ more efficient than the last
one (I never seen so good/stable/fast behavior before!).

This my new patch is against testing/test1-patch-2.2.0-pre3.gz that is
against v2.1/2.2.0-pre2 that is against patch-2.2.0-pre1-vs-2.1.132.gz
(where is this last one now?).

Ah, from testing/test1-patch-2.2.0-pre3.gz was missing the trashing memory
initialization that will allow every process to do a fast start.

Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.3 linux/kernel/fork.c:1.1.1.1.2.6
--- linux/kernel/fork.c:1.1.1.3	Thu Dec  3 12:55:12 1998
+++ linux/kernel/fork.c	Thu Dec 31 17:56:28 1998
@@ -567,6 +570,7 @@
 
 	/* ok, now we should be set up.. */
 	p->swappable = 1;
+	p->trashing_memory = 0;
 	p->exit_signal = clone_flags & CSIGNAL;
 	p->pdeath_signal = 0;
 
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.8 linux/mm/vmscan.c:1.1.1.1.2.49
--- linux/mm/vmscan.c:1.1.1.8	Fri Jan  1 19:12:54 1999
+++ linux/mm/vmscan.c	Fri Jan  1 20:29:19 1999
@@ -162,8 +162,9 @@
 			 * copy in memory, so we add it to the swap
 			 * cache. */
 			if (PageSwapCache(page_map)) {
+				entry = atomic_read(&page_map->count);
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return entry;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -180,8 +181,9 @@
 		 * asynchronously.  That's no problem, shrink_mmap() can
 		 * correctly clean up the occassional unshared page
 		 * which gets left behind in the swap cache. */
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return 1;	/* we slept: the process may not exist any more */
+		return entry;	/* we slept: the process may not exist any more */
 	}
 
 	/* The page was _not_ dirty, but still has a zero age.  It must
@@ -194,8 +196,9 @@
 		set_pte(page_table, __pte(entry));
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return entry;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,7 +213,7 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
+	entry = atomic_read(&page_map->count);
 	__free_page(page_map);
 	return entry;
 }
@@ -369,8 +372,14 @@
 	 * swapped out.  If the swap-out fails, we clear swap_cnt so the 
 	 * task won't be selected again until all others have been tried.
 	 */
-	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
+	counter = nr_tasks / (priority+1);
+	if (counter < 1)
+		counter = 1;
+	if (counter > nr_tasks)
+		counter = nr_tasks;
+
 	for (; counter >= 0; counter--) {
+		int retval;
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
@@ -382,15 +391,8 @@
 				continue;
 	 		if (p->mm->rss <= 0)
 				continue;
-			if (assign) {
-				/* 
-				 * If we didn't select a task on pass 1, 
-				 * assign each task a new swap_cnt.
-				 * Normalise the number of pages swapped
-				 * by multiplying by (RSS / 1MB)
-				 */
-				p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
-			}
+			if (assign)
+				p->swap_cnt = p->mm->rss;
 			if (p->swap_cnt > max_cnt) {
 				max_cnt = p->swap_cnt;
 				pbest = p;
@@ -404,14 +406,13 @@
 			}
 			goto out;
 		}
-		pbest->swap_cnt--;
-
 		/*
 		 * Nonzero means we cleared out something, but only "1" means
 		 * that we actually free'd up a page as a result.
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
-				return 1;
+		retval = swap_out_process(pbest, gfp_mask);
+		if (retval)
+			return retval;
 	}
 out:
 	return 0;
@@ -438,44 +439,74 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	switch (swap_out(priority, gfp_mask))
+	{
+	default:
+		shrink_mmap(priority, gfp_mask);
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
+	case 0:
+		/* swap_out() failed to swapout */
+		if (shrink_mmap(priority, gfp_mask))
+			return 1;
+		return 0;
+	case 1:
+		/* this would be the best but should not happen right now */
+		printk(KERN_DEBUG
+		       "do_free_user_and_cache: swapout returned 1\n");
+		return 1;
+	}
+}
 
-static int kswapd_free_pages(int kswapd_state)
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 6;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	kmem_cache_reap(gfp_mask);
 
+	switch (*state) {
+		do {
+		default:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 2;
+		case 2:
+			shrink_dcache_memory(priority, gfp_mask);
+			*state = 0;
+		} while (--priority >= 0);
+	}
+	return 0;
+}
+
+static int kswapd_free_pages(int kswapd_state)
+{
 	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 5;
-		int count = pager_daemon.swap_cluster;
+	unsigned long end_time = jiffies + (HZ-1)/100;
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				kswapd_state++;
-			case 2:
-				free_memory(swap_out(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+	do {
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
 			break;
 	} while (time_before_eq(jiffies,end_time));
+	/* take kswapd_state on the stack to save some byte of memory */
 	return kswapd_state;
 }
 
+static inline void enable_swap_tick(void)
+{
+	timer_table[SWAP_TIMER].expires = jiffies+(HZ+99)/100;
+	timer_active |= 1<<SWAP_TIMER;
+}
+
 /*
  * The background pageout daemon.
  * Started as a kernel thread from the init process.
@@ -523,6 +554,7 @@
 		current->state = TASK_INTERRUPTIBLE;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
+		enable_swap_tick();
 		schedule();
 		swapstats.wakeups++;
 		state = kswapd_free_pages(state);
@@ -542,35 +574,23 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
-
 		current->flags |= PF_MEMALLOC;
-	
-		priority = 5;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		while (count--)
+		{
+			static int state = 0;
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+		}
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
@@ -593,7 +613,8 @@
 	if (priority) {
 		p->counter = p->priority << priority;
 		wake_up_process(p);
-	}
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -631,9 +652,8 @@
 			want_wakeup = 3;
 	
 		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -642,7 +662,6 @@
 
 void init_swap_timer(void)
 {
-	timer_table[SWAP_TIMER].expires = jiffies;
 	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	enable_swap_tick();
 }
Index: linux/mm/swap_state.c
diff -u linux/mm/swap_state.c:1.1.1.4 linux/mm/swap_state.c:1.1.1.1.2.9
--- linux/mm/swap_state.c:1.1.1.4	Fri Jan  1 19:12:54 1999
+++ linux/mm/swap_state.c	Fri Jan  1 19:25:33 1999
@@ -262,6 +262,9 @@
 struct page * lookup_swap_cache(unsigned long entry)
 {
 	struct page *found;
+#ifdef	SWAP_CACHE_INFO
+	swap_cache_find_total++;
+#endif
 	
 	while (1) {
 		found = find_page(&swapper_inode, entry);
@@ -269,8 +272,12 @@
 			return 0;
 		if (found->inode != &swapper_inode || !PageSwapCache(found))
 			goto out_bad;
-		if (!PageLocked(found))
+		if (!PageLocked(found)) {
+#ifdef	SWAP_CACHE_INFO
+			swap_cache_find_success++;
+#endif
 			return found;
+		}
 		__free_page(found);
 		__wait_on_page(found);
 	}




If this patch is decreasing performance for you (eventually due too much
memory swapped out) you can try this incremental patch (I never tried here
btw):

Index: mm//vmscan.c
===================================================================
RCS file: /var/cvs/linux/mm/vmscan.c,v
retrieving revision 1.1.1.1.2.49
diff -u -r1.1.1.1.2.49 vmscan.c
--- vmscan.c	1999/01/01 19:29:19	1.1.1.1.2.49
+++ linux/mm/vmscan.c	1999/01/01 19:51:22
@@ -441,6 +441,9 @@
 
 static int do_free_user_and_cache(int priority, int gfp_mask)
 {
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
+
 	switch (swap_out(priority, gfp_mask))
 	{
 	default:



I written a swap benchmark that is dirtifying 160Mbyte of VM. For the
first loop 2.2-pre1 was taking 106 sec, for the second loop 120 and
then worse.

test1-pre3 + my new patch in this email, instead takes 120 sec in the
first loop (since it's allocating it's probably slowed down a bit by the
trashing_memory heuristic, and that's right), then it takes 90 sec in the
second loop and 77 sec in the third loop!! and the system was far to be
idle (as when I measured 2.2-pre1), but I was using it without special
regards and was perfectly usable (2.2-pre1 was unusable instead).

Comments?

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-01 20:02       ` Andrea Arcangeli
@ 1999-01-01 23:46         ` Steve Bergman
  1999-01-02  6:55           ` Linus Torvalds
  1999-01-02  3:03         ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Steve Bergman @ 1999-01-01 23:46 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Benjamin Redelings I, Stephen C. Tweedie, Linus Torvalds,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

Andrea Arcangeli wrote:


> 
> Please stop and try my new patch against Linus's test1-pre3 (that just
> merge some of my new stuff).

I got the patch and I must say I'm impressed.  I ran my "117 image" test
and got these results:

[Note: This loads 117 different images at the same time using 117
separate instances of 'xv' started in the background and results in ~
165 MB of swap area usage.  The machine is an AMD K6-2 300 with 128MB]


2.1.131-ac11                         172 sec  (This was previously the
best)
2.2.0-pre1 + Arcangeli's 1st patch   400 sec
test1-pre  + Arcangeli's 2nd patch   119 sec (!)

Processor utilization was substantially greater with the new patch
compared to either of the others.  Before it starts using swap, memory
is being consumed at ~ 4MB/sec.  After it starts to swap out, it streams
out at ~ 2MB/sec.

The performance is ~ 45% better than ac11 and ~ 70% better than
2.2.0-pre1 in this test.  

I was going to test the low memory case but got side tracked.


Thanks,
Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-01 20:02       ` Andrea Arcangeli
  1999-01-01 23:46         ` Steve Bergman
@ 1999-01-02  3:03         ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-02  3:03 UTC (permalink / raw
  To: Benjamin Redelings I, Stephen C. Tweedie, Linus Torvalds,
	Steve Bergman
  Cc: linux-kernel, Alan Cox, Rik van Riel, linux-mm

On Fri, 1 Jan 1999, Andrea Arcangeli wrote:

> I rediffed my VM patch against test1-patch-2.2.0-pre3.gz. I also fixed
> some bug (not totally critical but..) pointed out by Linus in my last
> code. I also changed the shrink_mmap(0) to shrink_mmap(priority) because
> it was completly sucking a lot performance. There is no need to do a
> shrink_mmap(0) for example if the cache/buffer are under min. In such case
> we must allow the swap_out() to grow the cache before start shrinking it.
> 
> So basically this new patch is _far_ more efficient than the last
> one (I never seen so good/stable/fast behavior before!).

Hmm, I just found a big problem, the patch was perfect as far as there was
no I/O bound application running.

When an I/O-bound application starts to read/write through the fs, the
buffer and the cache grow, so kswapd has to use do_free_user_and_cache()
to make space for the new data in the cache.

The problem with my last approach is that do_free_user_and_cache() was
always generating I/O to asynchronously push some part of user memory to the
swap. This had a _bad_ impact on the I/O performance of the I/O-bound process :(.

I am the first guy to hate seeing swapin/swapout while there are
tons of freeable memory used in cache/buffers.

So I obviously changed something. This new patch fixes the problem
fine; even if it doesn't achieve the same interactive performance as before
under heavy swapping (but it's near), it's a bit more sane ;).
The system is still perfectly balanced though, and now there are no
unnecessary swapins/swapouts under heavy fs operation while there is a lot of
freeable memory.

Since to be happy I always need to change something more than what is needed,
I also moved kmem_cache_reap() together with shrink_dcache().

Here is a new patch against test1-pre3. Steve if you are going
to make comparison let me know the results of course! Thanks.

You can also try to increase the priority = 8 in vmscan.c to 9 and see if the
benchmark is improved that way...

Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.3 linux/kernel/fork.c:1.1.1.1.2.6
--- linux/kernel/fork.c:1.1.1.3	Thu Dec  3 12:55:12 1998
+++ linux/kernel/fork.c	Thu Dec 31 17:56:28 1998
@@ -567,6 +570,7 @@
 
 	/* ok, now we should be set up.. */
 	p->swappable = 1;
+	p->trashing_memory = 0;
 	p->exit_signal = clone_flags & CSIGNAL;
 	p->pdeath_signal = 0;
 
Index: linux/mm/swap_state.c
diff -u linux/mm/swap_state.c:1.1.1.4 linux/mm/swap_state.c:1.1.1.1.2.9
--- linux/mm/swap_state.c:1.1.1.4	Fri Jan  1 19:12:54 1999
+++ linux/mm/swap_state.c	Fri Jan  1 19:25:33 1999
@@ -262,6 +262,9 @@
 struct page * lookup_swap_cache(unsigned long entry)
 {
 	struct page *found;
+#ifdef	SWAP_CACHE_INFO
+	swap_cache_find_total++;
+#endif
 	
 	while (1) {
 		found = find_page(&swapper_inode, entry);
@@ -269,8 +272,12 @@
 			return 0;
 		if (found->inode != &swapper_inode || !PageSwapCache(found))
 			goto out_bad;
-		if (!PageLocked(found))
+		if (!PageLocked(found)) {
+#ifdef	SWAP_CACHE_INFO
+			swap_cache_find_success++;
+#endif
 			return found;
+		}
 		__free_page(found);
 		__wait_on_page(found);
 	}
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.8 linux/mm/vmscan.c:1.1.1.1.2.51
--- linux/mm/vmscan.c:1.1.1.8	Fri Jan  1 19:12:54 1999
+++ linux/mm/vmscan.c	Sat Jan  2 04:18:31 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * Revisioned the page freeing algorithm: do_free_user_and_cache().
+ * Copyright (C) 1998  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -162,8 +167,9 @@
 			 * copy in memory, so we add it to the swap
 			 * cache. */
 			if (PageSwapCache(page_map)) {
+				entry = atomic_read(&page_map->count);
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return entry;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -180,8 +186,9 @@
 		 * asynchronously.  That's no problem, shrink_mmap() can
 		 * correctly clean up the occassional unshared page
 		 * which gets left behind in the swap cache. */
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return 1;	/* we slept: the process may not exist any more */
+		return entry;	/* we slept: the process may not exist any more */
 	}
 
 	/* The page was _not_ dirty, but still has a zero age.  It must
@@ -194,8 +201,9 @@
 		set_pte(page_table, __pte(entry));
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return entry;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,7 +218,7 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
+	entry = atomic_read(&page_map->count);
 	__free_page(page_map);
 	return entry;
 }
@@ -369,8 +377,14 @@
 	 * swapped out.  If the swap-out fails, we clear swap_cnt so the 
 	 * task won't be selected again until all others have been tried.
 	 */
-	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
+	counter = nr_tasks / (priority+1);
+	if (counter < 1)
+		counter = 1;
+	if (counter > nr_tasks)
+		counter = nr_tasks;
+
 	for (; counter >= 0; counter--) {
+		int retval;
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
@@ -382,15 +396,8 @@
 				continue;
 	 		if (p->mm->rss <= 0)
 				continue;
-			if (assign) {
-				/* 
-				 * If we didn't select a task on pass 1, 
-				 * assign each task a new swap_cnt.
-				 * Normalise the number of pages swapped
-				 * by multiplying by (RSS / 1MB)
-				 */
-				p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
-			}
+			if (assign)
+				p->swap_cnt = p->mm->rss;
 			if (p->swap_cnt > max_cnt) {
 				max_cnt = p->swap_cnt;
 				pbest = p;
@@ -404,14 +411,13 @@
 			}
 			goto out;
 		}
-		pbest->swap_cnt--;
-
 		/*
 		 * Nonzero means we cleared out something, but only "1" means
 		 * that we actually free'd up a page as a result.
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
-				return 1;
+		retval = swap_out_process(pbest, gfp_mask);
+		if (retval)
+			return retval;
 	}
 out:
 	return 0;
@@ -438,44 +444,64 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
 
-static int kswapd_free_pages(int kswapd_state)
+	if (swap_out(priority, gfp_mask))
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
+
+	return 0;
+}
+
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 8;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	switch (*state) {
+		do {
+		default:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 2;
+		case 2:
+			shrink_dcache_memory(priority, gfp_mask);
+			kmem_cache_reap(gfp_mask);
+			*state = 0;
+		} while (--priority >= 0);
+	}
+	return 0;
+}
 
+static int kswapd_free_pages(int kswapd_state)
+{
 	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 5;
-		int count = pager_daemon.swap_cluster;
+	unsigned long end_time = jiffies + (HZ-1)/100;
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				kswapd_state++;
-			case 2:
-				free_memory(swap_out(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+	do {
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
 			break;
 	} while (time_before_eq(jiffies,end_time));
+	/* take kswapd_state on the stack to save some byte of memory */
 	return kswapd_state;
 }
 
+static inline void enable_swap_tick(void)
+{
+	timer_table[SWAP_TIMER].expires = jiffies+(HZ+99)/100;
+	timer_active |= 1<<SWAP_TIMER;
+}
+
 /*
  * The background pageout daemon.
  * Started as a kernel thread from the init process.
@@ -523,6 +549,7 @@
 		current->state = TASK_INTERRUPTIBLE;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
+		enable_swap_tick();
 		schedule();
 		swapstats.wakeups++;
 		state = kswapd_free_pages(state);
@@ -542,35 +569,23 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
-
 		current->flags |= PF_MEMALLOC;
-	
-		priority = 5;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		while (count--)
+		{
+			static int state = 0;
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+		}
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
@@ -593,7 +608,8 @@
 	if (priority) {
 		p->counter = p->priority << priority;
 		wake_up_process(p);
-	}
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -631,9 +647,8 @@
 			want_wakeup = 3;
 	
 		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -642,7 +657,6 @@
 
 void init_swap_timer(void)
 {
-	timer_table[SWAP_TIMER].expires = jiffies;
 	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	enable_swap_tick();
 }



Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-01 23:46         ` Steve Bergman
@ 1999-01-02  6:55           ` Linus Torvalds
  1999-01-02  8:33             ` Steve Bergman
                               ` (3 more replies)
  0 siblings, 4 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-02  6:55 UTC (permalink / raw
  To: Steve Bergman
  Cc: Andrea Arcangeli, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm



On Fri, 1 Jan 1999, Steve Bergman wrote:
>
> I got the patch and I must say I'm impressed.  I ran my "117 image" test
> and got these results:
> 
> 2.1.131-ac11                         172 sec  (This was previously the best)
> 2.2.0-pre1 + Arcangeli's 1st patch   400 sec
> test1-pre  + Arcangeli's 2nd patch   119 sec (!)

Would you care to do some more testing? In particular, I'd like to hear
how basic 2.2.0pre3 works (that's essentially the same as test1-pre, with
only minor updates)? I'd like to calibrate the numbers against that,
rather than against kernels that I haven't actually ever run myself. 

The other thing I'd like to hear is how pre3 looks with this patch, which
should behave basically like Andrea's latest patch but without the
obfuscation he put into his patch..

		Linus

-----
diff -u --recursive --new-file v2.2.0-pre3/linux/Makefile linux/Makefile
--- v2.2.0-pre3/linux/Makefile	Fri Jan  1 12:58:14 1999
+++ linux/Makefile	Fri Jan  1 12:58:29 1999
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 2
 SUBLEVEL = 0
-EXTRAVERSION =-pre3
+EXTRAVERSION =-pre4
 
 ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/)
 
diff -u --recursive --new-file v2.2.0-pre3/linux/drivers/misc/parport_procfs.c linux/drivers/misc/parport_procfs.c
--- v2.2.0-pre3/linux/drivers/misc/parport_procfs.c	Sun Nov  8 14:02:59 1998
+++ linux/drivers/misc/parport_procfs.c	Fri Jan  1 21:27:12 1999
@@ -305,12 +305,11 @@
 {
 	base = new_proc_entry("parport", S_IFDIR, &proc_root,PROC_PARPORT,
 			      NULL);
-	base->fill_inode = &parport_modcount;
-
 	if (base == NULL) {
 		printk(KERN_ERR "Unable to initialise /proc/parport.\n");
 		return 0;
 	}
+	base->fill_inode = &parport_modcount;
 
 	return 1;
 }
diff -u --recursive --new-file v2.2.0-pre3/linux/fs/binfmt_misc.c linux/fs/binfmt_misc.c
--- v2.2.0-pre3/linux/fs/binfmt_misc.c	Fri Jan  1 12:58:20 1999
+++ linux/fs/binfmt_misc.c	Fri Jan  1 13:00:10 1999
@@ -30,6 +30,16 @@
 #include <asm/uaccess.h>
 #include <asm/spinlock.h>
 
+/*
+ * We should make this work with a "stub-only" /proc,
+ * which would just not be able to be configured.
+ * Right now the /proc-fs support is too black and white,
+ * though, so just remind people that this should be
+ * fixed..
+ */
+#ifndef CONFIG_PROC_FS
+#error You really need /proc support for binfmt_misc. Please reconfigure!
+#endif
 
 #define VERBOSE_STATUS /* undef this to save 400 bytes kernel memory */
 
diff -u --recursive --new-file v2.2.0-pre3/linux/include/linux/swapctl.h linux/include/linux/swapctl.h
--- v2.2.0-pre3/linux/include/linux/swapctl.h	Tue Dec 22 14:16:58 1998
+++ linux/include/linux/swapctl.h	Fri Jan  1 22:31:21 1999
@@ -90,18 +90,6 @@
 #define PAGE_DECLINE		(swap_control.sc_page_decline)
 #define PAGE_INITIAL_AGE	(swap_control.sc_page_initial_age)
 
-/* Given a resource of N units (pages or buffers etc), we only try to
- * age and reclaim AGE_CLUSTER_FRACT per 1024 resources each time we
- * scan the resource list. */
-static inline int AGE_CLUSTER_SIZE(int resources)
-{
-	unsigned int n = (resources * AGE_CLUSTER_FRACT) >> 10;
-	if (n < AGE_CLUSTER_MIN)
-		return AGE_CLUSTER_MIN;
-	else
-		return n;
-}
-
 #endif /* __KERNEL */
 
 #endif /* _LINUX_SWAPCTL_H */
diff -u --recursive --new-file v2.2.0-pre3/linux/mm/vmscan.c linux/mm/vmscan.c
--- v2.2.0-pre3/linux/mm/vmscan.c	Fri Jan  1 12:58:21 1999
+++ linux/mm/vmscan.c	Fri Jan  1 22:41:58 1999
@@ -363,13 +363,23 @@
 	/* 
 	 * We make one or two passes through the task list, indexed by 
 	 * assign = {0, 1}:
-	 *   Pass 1: select the swappable task with maximal swap_cnt.
-	 *   Pass 2: assign new swap_cnt values, then select as above.
+	 *   Pass 1: select the swappable task with maximal RSS that has
+	 *         not yet been swapped out. 
+	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
+	 *
 	 * With this approach, there's no need to remember the last task
 	 * swapped out.  If the swap-out fails, we clear swap_cnt so the 
 	 * task won't be selected again until all others have been tried.
+	 *
+	 * Think of swap_cnt as a "shadow rss" - it tells us which process
+	 * we want to page out (always try largest first).
 	 */
-	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
+	counter = nr_tasks / (priority+1);
+	if (counter < 1)
+		counter = 1;
+	if (counter > nr_tasks)
+		counter = nr_tasks;
+
 	for (; counter >= 0; counter--) {
 		assign = 0;
 		max_cnt = 0;
@@ -382,15 +392,9 @@
 				continue;
 	 		if (p->mm->rss <= 0)
 				continue;
-			if (assign) {
-				/* 
-				 * If we didn't select a task on pass 1, 
-				 * assign each task a new swap_cnt.
-				 * Normalise the number of pages swapped
-				 * by multiplying by (RSS / 1MB)
-				 */
-				p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
-			}
+			/* Refresh swap_cnt? */
+			if (assign)
+				p->swap_cnt = p->mm->rss;
 			if (p->swap_cnt > max_cnt) {
 				max_cnt = p->swap_cnt;
 				pbest = p;
@@ -404,14 +408,13 @@
 			}
 			goto out;
 		}
-		pbest->swap_cnt--;
 
 		/*
 		 * Nonzero means we cleared out something, but only "1" means
 		 * that we actually free'd up a page as a result.
 		 */
 		if (swap_out_process(pbest, gfp_mask) == 1)
-				return 1;
+			return 1;
 	}
 out:
 	return 0;
@@ -451,19 +454,17 @@
 	/* max one hundreth of a second */
 	end_time = jiffies + (HZ-1)/100;
 	do {
-		int priority = 5;
+		int priority = 8;
 		int count = pager_daemon.swap_cluster;
 
 		switch (kswapd_state) {
 			do {
 			default:
 				free_memory(shrink_mmap(priority, 0));
+				free_memory(swap_out(priority, 0));
 				kswapd_state++;
 			case 1:
 				free_memory(shm_swap(priority, 0));
-				kswapd_state++;
-			case 2:
-				free_memory(swap_out(priority, 0));
 				shrink_dcache_memory(priority, 0);
 				kswapd_state = 0;
 			} while (--priority >= 0);
@@ -562,7 +563,7 @@
 
 		current->flags |= PF_MEMALLOC;
 	
-		priority = 5;
+		priority = 8;
 		do {
 			free_memory(shrink_mmap(priority, gfp_mask));
 			free_memory(shm_swap(priority, gfp_mask));


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-02  6:55           ` Linus Torvalds
@ 1999-01-02  8:33             ` Steve Bergman
  1999-01-02 14:48             ` Andrea Arcangeli
                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-02  8:33 UTC (permalink / raw
  Cc: Benjamin Redelings I, Stephen C. Tweedie, linux-kernel, Alan Cox,
	Rik van Riel, linux-mm

Linus Torvalds wrote:
> 
> On Fri, 1 Jan 1999, Steve Bergman wrote:
> >
> > I got the patch and I must say I'm impressed.  I ran my "117 image" test
> > and got these results:
> >
> > 2.1.131-ac11                         172 sec  (This was previously the best)
> > 2.2.0-pre1 + Arcangeli's 1st patch   400 sec
> > test1-pre  + Arcangeli's 2nd patch   119 sec (!)
> 
> Would you care to do some more testing? In particular, I'd like to hear
> how basic 2.2.0pre3 works (that's essentially the same as test1-pre, with
> only minor updates)? I'd like to calibrate the numbers against that,
> rather than against kernels that I haven't actually ever run myself.
> 
> The other thing I'd like to hear is how pre3 looks with this patch, which
> should behave basically like Andrea's latest patch 

Hi Linus,

Andrea sent another patch to correct a problem with i/o bound processes,
which he also posted to linux-kernel.  The performance in this test is
unchanged.

Here are the results:


2.1.131-ac11                                    172 sec  

2.2.0-pre1 + Arcangeli's 1st patch              400 sec
test1-pre  + Arcangeli's 2nd patch              119 sec 
test1-pre  + Arcangeli's 3rd patch              119 sec
test1-pre  + Arcangeli's 3rd patch              117 sec 
(changed to priority = 9 in mm/vmscan.c)

2.2.0-pre3                                      175 sec
2.2.0-pre3 + Linus's patch                      129 sec

RH5.2 Stock (2.0.36-0.7)                        280 sec



I noticed while watching 'vmstat 1' during the test that
'2.2.0+Linus patch' was not *quite* as smooth as the Arcangeli patches,
in that there were periods of 2 or 3 seconds in which the swap-out rate
would fall to ~800k/sec and then jump back up to 1.8-2.5MB/sec.  I have
only run your patch once though.  I'll check it further tomorrow to
confirm that that is really the case.  Note how much better 2.2 is doing
compared to 2.0.36-0.7 in this situation.

I should be available for a good part of this weekend for further
testing; Just let me know.

As a reference:

AMD K6-2 300
128MB ram
2GB seagate scsi2 dedicated to swap
Data drive is 6.5GB UDMA


Steve Bergman
steve@netplus.net
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-02  6:55           ` Linus Torvalds
  1999-01-02  8:33             ` Steve Bergman
@ 1999-01-02 14:48             ` Andrea Arcangeli
  1999-01-02 15:38             ` Andrea Arcangeli
  1999-01-02 20:04             ` Steve Bergman
  3 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-02 14:48 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

On Fri, 1 Jan 1999, Linus Torvalds wrote:

> The other thing I'd like to hear is how pre3 looks with this patch, which
> should behave basically like Andrea's latest patch but without the
> obfuscation he put into his patch..

I still think the most important part of all my latest VM patches is my
new do_free_user_and_cache(). It allow the VM to scale very better and be
perfectly balanced. 

Why to run `count' times swap_out() without take a look if the cache grows
too much?

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-02  6:55           ` Linus Torvalds
  1999-01-02  8:33             ` Steve Bergman
  1999-01-02 14:48             ` Andrea Arcangeli
@ 1999-01-02 15:38             ` Andrea Arcangeli
  1999-01-02 18:10               ` Linus Torvalds
  1999-01-02 20:52               ` Andrea Arcangeli
  1999-01-02 20:04             ` Steve Bergman
  3 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-02 15:38 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

On Fri, 1 Jan 1999, Linus Torvalds wrote:

> The other thing I'd like to hear is how pre3 looks with this patch, which
> should behave basically like Andrea's latest patch but without the
> obfuscation he put into his patch..

I rediffed my latest swapout stuff against your latest tree (I consider
your latest patch as test1-pre4, right?).

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.52
--- linux/mm/vmscan.c:1.1.1.9	Sat Jan  2 15:46:20 1999
+++ linux/mm/vmscan.c	Sat Jan  2 15:53:33 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * Revisioned the page freeing algorithm: do_free_user_and_cache().
+ * Copyright (C) 1998  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -162,8 +167,9 @@
 			 * copy in memory, so we add it to the swap
 			 * cache. */
 			if (PageSwapCache(page_map)) {
+				entry = atomic_read(&page_map->count);
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return entry;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -180,8 +186,9 @@
 		 * asynchronously.  That's no problem, shrink_mmap() can
 		 * correctly clean up the occassional unshared page
 		 * which gets left behind in the swap cache. */
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return 1;	/* we slept: the process may not exist any more */
+		return entry;	/* we slept: the process may not exist any more */
 	}
 
 	/* The page was _not_ dirty, but still has a zero age.  It must
@@ -194,8 +201,9 @@
 		set_pte(page_table, __pte(entry));
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return entry;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,7 +218,7 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
+	entry = atomic_read(&page_map->count);
 	__free_page(page_map);
 	return entry;
 }
@@ -381,6 +389,7 @@
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
+		int retval;
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
@@ -413,8 +422,9 @@
 		 * Nonzero means we cleared out something, but only "1" means
 		 * that we actually free'd up a page as a result.
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
-			return 1;
+		retval = swap_out_process(pbest, gfp_mask);
+		if (retval)
+			return retval;
 	}
 out:
 	return 0;
@@ -441,42 +451,64 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
 
-static int kswapd_free_pages(int kswapd_state)
+	if (swap_out(priority, gfp_mask))
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
+
+	return 0;
+}
+
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 8;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	switch (*state) {
+		do {
+		default:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 2;
+		case 2:
+			shrink_dcache_memory(priority, gfp_mask);
+			kmem_cache_reap(gfp_mask);
+			*state = 0;
+		} while (--priority >= 0);
+	}
+	return 0;
+}
 
+static int kswapd_free_pages(int kswapd_state)
+{
 	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 8;
-		int count = pager_daemon.swap_cluster;
+	unsigned long end_time = jiffies + (HZ-1)/100;
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				free_memory(swap_out(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+	do {
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
 			break;
 	} while (time_before_eq(jiffies,end_time));
+	/* take kswapd_state on the stack to save some byte of memory */
 	return kswapd_state;
 }
 
+static inline void enable_swap_tick(void)
+{
+	timer_table[SWAP_TIMER].expires = jiffies+(HZ+99)/100;
+	timer_active |= 1<<SWAP_TIMER;
+}
+
 /*
  * The background pageout daemon.
  * Started as a kernel thread from the init process.
@@ -524,6 +556,7 @@
 		current->state = TASK_INTERRUPTIBLE;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
+		enable_swap_tick();
 		schedule();
 		swapstats.wakeups++;
 		state = kswapd_free_pages(state);
@@ -543,35 +576,23 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
-
 		current->flags |= PF_MEMALLOC;
-	
-		priority = 8;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		while (count--)
+		{
+			static int state = 0;
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+		}
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
@@ -594,7 +615,8 @@
 	if (priority) {
 		p->counter = p->priority << priority;
 		wake_up_process(p);
-	}
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -632,9 +654,8 @@
 			want_wakeup = 3;
 	
 		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -643,7 +664,6 @@
 
 void init_swap_timer(void)
 {
-	timer_table[SWAP_TIMER].expires = jiffies;
 	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	enable_swap_tick();
 }



The try_to_swap_out() changes (entry = atomic_read()) are really not
important for performance. We could always return 1 instead of
atomic_read() and treat a retval of 1 from swap_out() the same as every
current retval >1. Since I can't see a big performance impact from
atomic_read() I left it here, since it gives us more info than returning a
plain 1 and thus knowing only that we have successfully unlinked a page
from the user process memory.

I also have a new experimental patch against the one above, which here
improves the swapout performance a _lot_. The benchmark that dirties 160
Mbyte in a loop used to take nearly 106 sec and now takes 89 sec. It will
also prevent all non-trashing processes from being swapped out.

I don't consider this production code though but I am interested if
somebody will try it ;):

Index: mm//vmscan.c
===================================================================
RCS file: /var/cvs/linux/mm/vmscan.c,v
retrieving revision 1.1.1.1.2.52
diff -u -r1.1.1.1.2.52 vmscan.c
--- vmscan.c	1999/01/02 14:53:33	1.1.1.1.2.52
+++ linux/mm/vmscan.c	1999/01/02 15:19:21
@@ -353,7 +353,6 @@
 	}
 
 	/* We didn't find anything for the process */
-	p->swap_cnt = 0;
 	p->swap_address = 0;
 	return 0;
 }
@@ -423,6 +422,14 @@
 		 * that we actually free'd up a page as a result.
 		 */
 		retval = swap_out_process(pbest, gfp_mask);
+		/*
+		 * Don't play with other tasks next time if the huge one
+		 * is been swapedin in the meantime. This can be considered
+		 * a bit experimental, but it seems to improve a lot the
+		 * swapout performances here. -arca
+		 */
+		p->swap_cnt = p->mm->rss;
+
 		if (retval)
 			return retval;
 	}
 

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-02 15:38             ` Andrea Arcangeli
@ 1999-01-02 18:10               ` Linus Torvalds
  1999-01-02 20:52               ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-02 18:10 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Steve Bergman, Benjamin Redelings I, Stephen C. Tweedie, Alan Cox,
	Rik van Riel, linux-mm



On Sat, 2 Jan 1999, Andrea Arcangeli wrote:
> 
> > The other thing I'd like to hear is how pre3 looks with this patch, which
> > should behave basically like Andrea's latest patch but without the
> > obfuscation he put into his patch..
> 
> I rediffed my latest swapout stuff against your latest tree (I consider
> your latest patch as test1-pre4, right?).

Andrea, I already told you that I refuse to apply patches that include
this many obvious cases of pure obfuscation.

As I already told you in an earlier mail, your state machine only has two
states, not three like the code makes you believe. Gratuitous changes like
that that only show that the writer didn't actually _think_ about the code
is not something I want at any stage, much less now.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-02  6:55           ` Linus Torvalds
                               ` (2 preceding siblings ...)
  1999-01-02 15:38             ` Andrea Arcangeli
@ 1999-01-02 20:04             ` Steve Bergman
  3 siblings, 0 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-02 20:04 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

Linus Torvalds wrote:
> 
> Would you care to do some more testing? In particular, I'd like to hear
> how basic 2.2.0pre3 works (that's essentially the same as test1-pre, with
> only minor updates)? I'd like to calibrate the numbers against that,
> rather than against kernels that I haven't actually ever run myself.
> 

I've done some more testing, this time including the low memory case. 
For low memory testing I built the dhcp server from SRPM in 8MB with X,
xdm, various daemons (sendmail, named, inetd, etc.), and vmstat 1
running.  Swap area stayed at about 8MB usage.  I have also run the
128MB tests some more and have slightly more accurate results.  Here is
the summary:



Kernel                                          128MB              8MB
------------                                    -------            ------
2.1.131-ac11                                    172 sec            260 sec
test1-pre  + Arcangeli's patch                  119 sec            226 sec
2.2.0-pre3                                      175 sec            334 sec
2.2.0-pre3 + Linus's patch                      129 sec            312 sec
RH5.2 Stock (2.0.36-0.7)                        280 sec            N/A



-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-02 15:38             ` Andrea Arcangeli
  1999-01-02 18:10               ` Linus Torvalds
@ 1999-01-02 20:52               ` Andrea Arcangeli
  1999-01-03  2:59                 ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-02 20:52 UTC (permalink / raw
  To: Steve Bergman
  Cc: Linus Torvalds, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

On Sat, 2 Jan 1999, Andrea Arcangeli wrote:

> I rediffed my latest swapout stuff against your latest tree (I consider
> your latest patch as test1-pre4, right?).

I developed new exciting stuff this afternoon! The most important thing is
the smart swapout weight code. Basing the priority on the number of
processes to try to swap out was really ugly and not smart.

The second change is to shrink_mmap(); it makes shrink_mmap() care much
more about aging. We have only one bit and we must use it carefully so we
don't fall out of cache ;)

I also added/removed some PG_referenced bits. But please don't trust the
PG_referenced changes too much, since I have not thought about them very
carefully (maybe they are not needed?).

I went back to putting the minimum cache and buffer levels at 5%. This
allows me to run every memory-thrashing program I can, for as long as I
like, while still keeping my last commands (free) and filesystem data
(ls -l) in cache (because the memory thrasher _only_ plays with its own VM
and asks nothing of the kernel, of course).

Ah, and whoops: in the last patch I made a mistake and forgot to change
max_cnt to unsigned long. This should also be changed in your tree, Linus.

This new patch really seems to rock here and seems _far_ better than
anything I tried before! Steve, could you try it and send feedback? Thanks ;)

Please excuse me, Linus, if I have not yet cleaned things up, but my spare
time is very limited and I want to _try_ to improve things a bit more
first...

This patch is against 2.2.0-pre4 (the latest patch posted by Linus here).

Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.2.11
--- linux/include/linux/mm.h:1.1.1.3	Sat Jan  2 15:24:18 1999
+++ linux/include/linux/mm.h	Sat Jan  2 21:40:13 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -295,8 +294,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
Index: linux/include/linux/pagemap.h
diff -u linux/include/linux/pagemap.h:1.1.1.1 linux/include/linux/pagemap.h:1.1.1.1.2.1
--- linux/include/linux/pagemap.h:1.1.1.1	Fri Nov 20 00:01:16 1998
+++ linux/include/linux/pagemap.h	Sat Jan  2 21:40:13 1999
@@ -77,6 +77,7 @@
 		*page->pprev_hash = page->next_hash;
 		page->pprev_hash = NULL;
 	}
+	clear_bit(PG_referenced, &page->flags);
 	page_cache_size--;
 }
 
Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.8 linux/mm/filemap.c:1.1.1.1.2.35
--- linux/mm/filemap.c:1.1.1.8	Fri Jan  1 19:12:53 1999
+++ linux/mm/filemap.c	Sat Jan  2 21:40:13 1999
@@ -118,6 +122,10 @@
 	__free_page(page);
 }
 
+#define HANDLE_AGING(page)					\
+	if (test_and_clear_bit(PG_referenced, &(page)->flags))	\
+		continue;
+
 int shrink_mmap(int priority, int gfp_mask)
 {
 	static unsigned long clock = 0;
@@ -140,12 +148,11 @@
 			page = page->next_hash;
 			clock = page->map_nr;
 		}
-		
-		if (test_and_clear_bit(PG_referenced, &page->flags))
-			continue;
 
 		/* Decrement count only for non-referenced pages */
-		count--;
+		if (!test_bit(PG_referenced, &page->flags))
+			count--;
+
 		if (PageLocked(page))
 			continue;
 
@@ -160,6 +167,7 @@
 		if (page->buffers) {
 			if (buffer_under_min())
 				continue;
+			HANDLE_AGING(page);
 			if (!try_to_free_buffers(page))
 				continue;
 			return 1;
@@ -167,12 +175,14 @@
 
 		/* is it a swap-cache or page-cache page? */
 		if (page->inode) {
-			if (pgcache_under_min())
-				continue;
 			if (PageSwapCache(page)) {
+				HANDLE_AGING(page);
 				delete_from_swap_cache(page);
 				return 1;
 			}
+			if (pgcache_under_min())
+				continue;
+			HANDLE_AGING(page);
 			remove_inode_page(page);
 			return 1;
 		}
@@ -181,6 +191,8 @@
 	return 0;
 }
 
+#undef HANDLE_AGING
+
 /*
  * Update a page cache copy, when we're doing a "write()" system call
  * See also "update_vm_cache()".
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.5 linux/mm/swap.c:1.1.1.1.2.8
--- linux/mm/swap.c:1.1.1.5	Sat Jan  2 15:24:40 1999
+++ linux/mm/swap.c	Sat Jan  2 21:40:13 1999
@@ -64,13 +64,13 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
+	5,	/* minimum percent buffer */
 	10,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	2,	/* minimum percent page cache */
+	5,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
 };
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.57
--- linux/mm/vmscan.c:1.1.1.9	Sat Jan  2 15:46:20 1999
+++ linux/mm/vmscan.c	Sat Jan  2 21:45:22 1999
@@ -10,6 +10,12 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * Revisioned the page freeing algorithm (do_free_user_and_cache), and
+ * developed a smart mechanism to handle the swapout weight.
+ * Copyright (C) 1998  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -162,8 +168,9 @@
 			 * copy in memory, so we add it to the swap
 			 * cache. */
 			if (PageSwapCache(page_map)) {
+				entry = atomic_read(&page_map->count);
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return entry;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -180,8 +187,9 @@
 		 * asynchronously.  That's no problem, shrink_mmap() can
 		 * correctly clean up the occassional unshared page
 		 * which gets left behind in the swap cache. */
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return 1;	/* we slept: the process may not exist any more */
+		return entry;	/* we slept: the process may not exist any more */
 	}
 
 	/* The page was _not_ dirty, but still has a zero age.  It must
@@ -194,8 +202,9 @@
 		set_pte(page_table, __pte(entry));
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return entry;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,7 +219,7 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
+	entry = atomic_read(&page_map->count);
 	__free_page(page_map);
 	return entry;
 }
@@ -230,7 +239,7 @@
  */
 
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter, unsigned long * next_addr)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -256,13 +265,19 @@
 		if (result)
 			return result;
 		address += PAGE_SIZE;
+		if (!*counter)
+		{
+			*next_addr = address;
+			return 0;
+		} else
+			(*counter)--;
 		pte++;
 	} while (address < end);
 	return 0;
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter, unsigned long * next_addr)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -282,9 +297,11 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter, next_addr);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
@@ -292,7 +309,7 @@
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, int gfp_mask)
+	unsigned long address, int gfp_mask, unsigned long * counter, unsigned long * next_addr)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -306,16 +323,19 @@
 
 	end = vma->vm_end;
 	while (address < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter, next_addr);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	}
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_process(struct task_struct * p, int gfp_mask,
+			    unsigned long * counter)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -334,9 +354,16 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(p, vma, address, gfp_mask);
+			unsigned long next_addr;
+			int result = swap_out_vma(p, vma, address, gfp_mask,
+						  counter, &next_addr);
 			if (result)
 				return result;
+			if (!*counter)
+			{
+				p->swap_address = next_addr;
+				return 0;
+			}
 			vma = vma->vm_next;
 			if (!vma)
 				break;
@@ -350,6 +377,19 @@
 	return 0;
 }
 
+static unsigned long total_rss(void)
+{
+	unsigned long total_rss = 0;
+	struct task_struct * p;
+
+	read_lock(&tasklist_lock);
+	for (p = init_task.next_task; p != &init_task; p = p->next_task)
+		total_rss += p->mm->rss;
+	read_unlock(&tasklist_lock);
+
+	return total_rss;
+}
+
 /*
  * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
@@ -358,7 +398,10 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int assign;
+	unsigned long max_cnt, counter;
+
+	counter = total_rss() >> priority;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -374,13 +417,8 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
-	if (counter < 1)
-		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
-
-	for (; counter >= 0; counter--) {
+	while (counter > 0) {
+		int retval;
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
@@ -413,8 +451,9 @@
 		 * Nonzero means we cleared out something, but only "1" means
 		 * that we actually free'd up a page as a result.
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
-			return 1;
+		retval = swap_out_process(pbest, gfp_mask, &counter);
+		if (retval)
+			return retval;
 	}
 out:
 	return 0;
@@ -441,42 +480,63 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
 
-static int kswapd_free_pages(int kswapd_state)
+	if (swap_out(priority, gfp_mask))
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
+
+	return 0;
+}
+
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 8;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	switch (*state) {
+		do {
+		default:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 0;
 
+			shrink_dcache_memory(priority, gfp_mask);
+			kmem_cache_reap(gfp_mask);
+		} while (--priority >= 0);
+	}
+	return 0;
+}
+
+static int kswapd_free_pages(int kswapd_state)
+{
 	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 8;
-		int count = pager_daemon.swap_cluster;
+	unsigned long end_time = jiffies + (HZ-1)/100;
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				free_memory(swap_out(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+	do {
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
 			break;
 	} while (time_before_eq(jiffies,end_time));
+	/* take kswapd_state on the stack to save some byte of memory */
 	return kswapd_state;
 }
 
+static inline void enable_swap_tick(void)
+{
+	timer_table[SWAP_TIMER].expires = jiffies+(HZ+99)/100;
+	timer_active |= 1<<SWAP_TIMER;
+}
+
 /*
  * The background pageout daemon.
  * Started as a kernel thread from the init process.
@@ -524,6 +584,7 @@
 		current->state = TASK_INTERRUPTIBLE;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
+		enable_swap_tick();
 		schedule();
 		swapstats.wakeups++;
 		state = kswapd_free_pages(state);
@@ -543,35 +604,23 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
-
 		current->flags |= PF_MEMALLOC;
-	
-		priority = 8;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		while (count--)
+		{
+			static int state = 0;
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+		}
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
@@ -594,7 +643,8 @@
 	if (priority) {
 		p->counter = p->priority << priority;
 		wake_up_process(p);
-	}
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -632,9 +682,8 @@
 			want_wakeup = 3;
 	
 		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -643,7 +692,6 @@
 
 void init_swap_timer(void)
 {
-	timer_table[SWAP_TIMER].expires = jiffies;
 	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	enable_swap_tick();
 }
Index: linux/fs/buffer.c
diff -u linux/fs/buffer.c:1.1.1.5 linux/fs/buffer.c:1.1.1.1.2.6
--- linux/fs/buffer.c:1.1.1.5	Fri Jan  1 19:10:20 1999
+++ linux/fs/buffer.c	Sat Jan  2 21:40:07 1999
@@ -1263,6 +1263,7 @@
 		panic("brw_page: page not locked for I/O");
 	clear_bit(PG_uptodate, &page->flags);
 	clear_bit(PG_error, &page->flags);
+	set_bit(PG_referenced, &page->flags);
 	/*
 	 * Allocate async buffer heads pointing to this page, just for I/O.
 	 * They do _not_ show up in the buffer hash table!


Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-02 20:52               ` Andrea Arcangeli
@ 1999-01-03  2:59                 ` Andrea Arcangeli
  1999-01-04 18:08                   ` [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] Andrea Arcangeli
  1999-01-05 13:33                   ` [patch] new-vm improvement [Re: 2.2.0 Bug summary] Ben McCann
  0 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-03  2:59 UTC (permalink / raw
  To: Steve Bergman
  Cc: Linus Torvalds, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

On Sat, 2 Jan 1999, Andrea Arcangeli wrote:

> is the swapout smart weight code. Basing the priority on the number of
> process to try to swapout was really ugly and not smart. 

But I made two mistakes in it. Benjamin pointed out within a millisecond
that there was no need to put the address on the stack, and looking a
_bit_ more at swap_out_pmd() I noticed that the old code was already
updating swap_address, whoops ;).

I noticed the second, much more important mistake while running with
8 Mbyte, because the memory-thrashing program was segfaulting. The bug was
basing the maximal weight of swap_out() on total_rss instead of on the sum
of the total_vm of all processes. With 8 Mbyte all my processes got swapped
out and so swap_out stopped working ;). It's fixed now...

> The second change is done over shrink_mmap(), this will cause
> shrink_mmap() to care very more about aging. We have only one bit and we
> must use it carefully to get not out of cache ;) 

This change is pretty buggy too. The only good part was ignoring the
pgcache min limits before shrinking the _swap_cache_. Now I have also
changed pgcache_under_min to ignore the swap cache size (the swap cache now
varies a bit more quickly/wildly).

> I also added/removed some PG_referenced. But please, don't trust too much
> the pg_refernced changes since I have not thought about it too much (maybe
> they are not needed?). 

Hmm, I guess at least the set_bit in brw_page was not needed, because
before that function runs, either __find_page() or an add_to_...cache()
has already been run.

> Ah and woops, in the last patch I do a mistake and I forget to change
> max_cnt to unsigned long. This should be changed also in your tree, Linus. 

Also, some counts should be changed from int to unsigned long to handle
huge RAM sizes.

> This new patch seems to really rocks here and seems _far_ better than
> anything I tried before! Steve, could try it and feedback? Thanks ;) 

Here Steve's feedback:

                      128MB       8MB
                      -------     -------
Your previous patch:  132 sec     218 sec
This patch         :  118 sec     226 sec       

Even though `This patch' was pretty buggy (as pointed out above) it was
going slightly _faster_. I guess the reason for the 8 Mbyte slowdown was
the s/rss/total_vm/ change (but I am not 100% sure).

I fixed the bugs, so I am reposting the fixed diff against pre4. I have
also cleaned up a few things...

Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.2.12
--- linux/include/linux/mm.h:1.1.1.3	Sat Jan  2 15:24:18 1999
+++ linux/include/linux/mm.h	Sun Jan  3 03:43:52 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -295,8 +294,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
@@ -379,8 +377,8 @@
 
 #define buffer_under_min()	((buffermem >> PAGE_SHIFT) * 100 < \
 				buffer_mem.min_percent * num_physpages)
-#define pgcache_under_min()	(page_cache_size * 100 < \
-				page_cache.min_percent * num_physpages)
+#define pgcache_under_min()	((page_cache_size-swapper_inode.i_nrpages)*100\
+				< page_cache.min_percent * num_physpages)
 
 #endif /* __KERNEL__ */
 
Index: linux/include/linux/pagemap.h
diff -u linux/include/linux/pagemap.h:1.1.1.1 linux/include/linux/pagemap.h:1.1.1.1.2.1
--- linux/include/linux/pagemap.h:1.1.1.1	Fri Nov 20 00:01:16 1998
+++ linux/include/linux/pagemap.h	Sat Jan  2 21:40:13 1999
@@ -77,6 +77,7 @@
 		*page->pprev_hash = page->next_hash;
 		page->pprev_hash = NULL;
 	}
+	clear_bit(PG_referenced, &page->flags);
 	page_cache_size--;
 }
 
Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.8 linux/mm/filemap.c:1.1.1.1.2.36
--- linux/mm/filemap.c:1.1.1.8	Fri Jan  1 19:12:53 1999
+++ linux/mm/filemap.c	Sun Jan  3 03:13:09 1999
@@ -122,13 +126,14 @@
 {
 	static unsigned long clock = 0;
 	unsigned long limit = num_physpages;
+	unsigned long count;
 	struct page * page;
-	int count;
 
 	count = limit >> priority;
 
 	page = mem_map + clock;
-	do {
+	while (count != 0)
+	{
 		page++;
 		clock++;
 		if (clock >= max_mapnr) {
@@ -167,17 +172,17 @@
 
 		/* is it a swap-cache or page-cache page? */
 		if (page->inode) {
-			if (pgcache_under_min())
-				continue;
 			if (PageSwapCache(page)) {
 				delete_from_swap_cache(page);
 				return 1;
 			}
+			if (pgcache_under_min())
+				continue;
 			remove_inode_page(page);
 			return 1;
 		}
 
-	} while (count > 0);
+	}
 	return 0;
 }
 
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.5 linux/mm/swap.c:1.1.1.1.2.8
--- linux/mm/swap.c:1.1.1.5	Sat Jan  2 15:24:40 1999
+++ linux/mm/swap.c	Sat Jan  2 21:40:13 1999
@@ -64,13 +64,13 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
+	5,	/* minimum percent buffer */
 	10,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	2,	/* minimum percent page cache */
+	5,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
 };
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.59
--- linux/mm/vmscan.c:1.1.1.9	Sat Jan  2 15:46:20 1999
+++ linux/mm/vmscan.c	Sun Jan  3 03:43:54 1999
@@ -10,6 +10,12 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * Revisioned the page freeing algorithm (do_free_user_and_cache), and
+ * developed a smart mechanism to handle the swapout weight.
+ * Copyright (C) 1998  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -163,7 +169,7 @@
 			 * cache. */
 			if (PageSwapCache(page_map)) {
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return 1;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -195,7 +201,7 @@
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return 1;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,9 +216,8 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
 	__free_page(page_map);
-	return entry;
+	return 1;
 }
 
 /*
@@ -230,7 +235,7 @@
  */
 
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -251,18 +256,20 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		address += PAGE_SIZE;
+		tsk->swap_address = address;
 		if (result)
 			return result;
-		address += PAGE_SIZE;
+		if (!--*counter)
+			return 0;
 		pte++;
 	} while (address < end);
 	return 0;
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -282,9 +289,11 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
@@ -292,7 +301,7 @@
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, int gfp_mask)
+	unsigned long address, int gfp_mask, unsigned long * counter)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -306,16 +315,19 @@
 
 	end = vma->vm_end;
 	while (address < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	}
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_process(struct task_struct * p, int gfp_mask,
+			    unsigned long * counter)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -334,9 +346,12 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(p, vma, address, gfp_mask);
+			int result = swap_out_vma(p, vma, address, gfp_mask,
+						  counter);
 			if (result)
 				return result;
+			if (!*counter)
+				return 0;
 			vma = vma->vm_next;
 			if (!vma)
 				break;
@@ -350,6 +365,19 @@
 	return 0;
 }
 
+static unsigned long get_total_vm(void)
+{
+	unsigned long total_vm = 0;
+	struct task_struct * p;
+
+	read_lock(&tasklist_lock);
+	for_each_task(p)
+		total_vm += p->mm->total_vm;
+	read_unlock(&tasklist_lock);
+
+	return total_vm;
+}
+
 /*
  * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
@@ -358,8 +386,11 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int assign;
+	unsigned long counter, max_cnt;
 
+	counter = get_total_vm() >> priority;
+
 	/* 
 	 * We make one or two passes through the task list, indexed by 
 	 * assign = {0, 1}:
@@ -374,20 +405,14 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
-	if (counter < 1)
-		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
-
-	for (; counter >= 0; counter--) {
+	while (counter > 0) {
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
 	select:
 		read_lock(&tasklist_lock);
-		p = init_task.next_task;
-		for (; p != &init_task; p = p->next_task) {
+		for_each_task(p)
+		{
 			if (!p->swappable)
 				continue;
 	 		if (p->mm->rss <= 0)
@@ -410,10 +435,11 @@
 		}
 
 		/*
-		 * Nonzero means we cleared out something, but only "1" means
-		 * that we actually free'd up a page as a result.
+		 * Nonzero means we cleared out something, and "1" means
+		 * that we actually moved a page from the process memory
+		 * to the swap cache (it's not been freed yet).
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
+		if (swap_out_process(pbest, gfp_mask, &counter))
 			return 1;
 	}
 out:
@@ -441,42 +467,63 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
 
-static int kswapd_free_pages(int kswapd_state)
+	if (swap_out(priority, gfp_mask))
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
+
+	return 0;
+}
+
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 8;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	switch (*state) {
+		do {
+		default:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 0;
+
+			shrink_dcache_memory(priority, gfp_mask);
+			kmem_cache_reap(gfp_mask);
+		} while (--priority >= 0);
+	}
+	return 0;
+}
 
+static int kswapd_free_pages(int kswapd_state)
+{
 	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 8;
-		int count = pager_daemon.swap_cluster;
+	unsigned long end_time = jiffies + (HZ-1)/100;
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				free_memory(swap_out(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+	do {
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
 			break;
 	} while (time_before_eq(jiffies,end_time));
+	/* take kswapd_state on the stack to save some byte of memory */
 	return kswapd_state;
 }
 
+static inline void enable_swap_tick(void)
+{
+	timer_table[SWAP_TIMER].expires = jiffies+(HZ+99)/100;
+	timer_active |= 1<<SWAP_TIMER;
+}
+
 /*
  * The background pageout daemon.
  * Started as a kernel thread from the init process.
@@ -524,6 +571,7 @@
 		current->state = TASK_INTERRUPTIBLE;
 		flush_signals(current);
 		run_task_queue(&tq_disk);
+		enable_swap_tick();
 		schedule();
 		swapstats.wakeups++;
 		state = kswapd_free_pages(state);
@@ -543,35 +591,23 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
-
 		current->flags |= PF_MEMALLOC;
-	
-		priority = 8;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		while (count--)
+		{
+			static int state = 0;
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+		}
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
@@ -594,7 +630,8 @@
 	if (priority) {
 		p->counter = p->priority << priority;
 		wake_up_process(p);
-	}
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -632,9 +669,8 @@
 			want_wakeup = 3;
 	
 		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
+	} else
+		enable_swap_tick();
 }
 
 /* 
@@ -643,7 +679,6 @@
 
 void init_swap_timer(void)
 {
-	timer_table[SWAP_TIMER].expires = jiffies;
 	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	enable_swap_tick();
 }



As usual if you Steve or other will try this I am interested about numbers
;). Thanks.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-03  2:59                 ` Andrea Arcangeli
@ 1999-01-04 18:08                   ` Andrea Arcangeli
  1999-01-04 20:56                     ` Linus Torvalds
  1999-01-05 13:33                   ` [patch] new-vm improvement [Re: 2.2.0 Bug summary] Ben McCann
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-04 18:08 UTC (permalink / raw
  To: Steve Bergman
  Cc: Linus Torvalds, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

I have a new revolutionary patch. The main thing is that I killed kswapd
just to make Rik happy ;).

Ah, and my last patches had a little bug that was surely hurting performance
against Linus's VM, since I was stopping kswapd when nr_free_pages >
freepages.high was true, and not, as Linus rightly does, when
nr_free_pages > freepages.high + swap_cluster. So I was causing a lot of
kswapd wakeups.

There was also a not improved thing in the trashing_memory heuristic, that
is to remove the trashing bit only if PF_MEMALLOC is not set.

Ah and the swapout code seems to like a linear and not exponential
priority handling. Probably it likes more to succeed than shrink_mmap().

If you'll try it let me know. I am interested about the image load test
(that should be the most near to the real world). 

With this patch the swapout performances are doubled. The swapout
benchmark that was used to take 100 sec with my old code and with Linus's
VM, now run in 50sec! Now I go to 6Mbyte at sec (3so and 3si) instead of
3Mbyte sec (1.5so, 1.5si). 6mbyte/sec is the performance reported by
hdparm -t btw ;). And the whole system is perfectly fluid (far more fluid
than with the old code). I can open an xterm without waiting seconds. The
cache does not get kicked out. It seems really great here. When the system
goes OOM it seems to recover fine.

Here arca-vm-6 against 2.2.0-pre4:

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.62
--- linux/mm/vmscan.c:1.1.1.9	Sat Jan  2 15:46:20 1999
+++ linux/mm/vmscan.c	Mon Jan  4 18:42:54 1999
@@ -10,6 +10,12 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * Revisioned the page freeing algorithm (do_free_user_and_cache), and
+ * developed a smart mechanism to handle the swapout weight. Removed kswapd.
+ * Copyright (C) 1998  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,13 +26,6 @@
 
 #include <asm/pgtable.h>
 
-/* 
- * The wait queue for waking up the pageout daemon:
- */
-static struct task_struct * kswapd_task = NULL;
-
-static void init_swap_timer(void);
-
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -163,7 +162,7 @@
 			 * cache. */
 			if (PageSwapCache(page_map)) {
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return 1;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -195,7 +194,7 @@
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return 1;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,9 +209,8 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
 	__free_page(page_map);
-	return entry;
+	return 1;
 }
 
 /*
@@ -230,7 +228,7 @@
  */
 
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -251,18 +249,20 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		address += PAGE_SIZE;
+		tsk->swap_address = address;
 		if (result)
 			return result;
-		address += PAGE_SIZE;
+		if (!--*counter)
+			return 0;
 		pte++;
 	} while (address < end);
 	return 0;
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -282,9 +282,11 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
@@ -292,7 +294,7 @@
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, int gfp_mask)
+	unsigned long address, int gfp_mask, unsigned long * counter)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -306,16 +308,19 @@
 
 	end = vma->vm_end;
 	while (address < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	}
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_process(struct task_struct * p, int gfp_mask,
+			    unsigned long * counter)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -334,9 +339,12 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(p, vma, address, gfp_mask);
+			int result = swap_out_vma(p, vma, address, gfp_mask,
+						  counter);
 			if (result)
 				return result;
+			if (!*counter)
+				return 0;
 			vma = vma->vm_next;
 			if (!vma)
 				break;
@@ -350,6 +358,25 @@
 	return 0;
 }
 
+static inline unsigned long calc_swapout_weight(int priority)
+{
+	struct task_struct * p;
+	unsigned long total_vm = 0;
+
+	read_lock(&tasklist_lock);
+	for_each_task(p)
+	{
+		if (!p->swappable)
+			continue;
+		if (p->mm->rss == 0)
+			continue;
+		total_vm += p->mm->total_vm;
+	}
+	read_unlock(&tasklist_lock);
+
+	return total_vm / (priority+1);
+}
+
 /*
  * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
@@ -358,8 +385,11 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int assign;
+	unsigned long counter, max_cnt;
 
+	counter = calc_swapout_weight(priority);
+
 	/* 
 	 * We make one or two passes through the task list, indexed by 
 	 * assign = {0, 1}:
@@ -374,23 +404,17 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
-	if (counter < 1)
-		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
-
-	for (; counter >= 0; counter--) {
+	while (counter != 0) {
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
 	select:
 		read_lock(&tasklist_lock);
-		p = init_task.next_task;
-		for (; p != &init_task; p = p->next_task) {
+		for_each_task(p)
+		{
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -410,127 +434,51 @@
 		}
 
 		/*
-		 * Nonzero means we cleared out something, but only "1" means
-		 * that we actually free'd up a page as a result.
+		 * Nonzero means we cleared out something, and "1" means
+		 * that we actually moved a page from the process memory
+		 * to the swap cache (it's not been freed yet).
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
+		if (swap_out_process(pbest, gfp_mask, &counter))
 			return 1;
 	}
 out:
 	return 0;
 }
 
-/*
- * Before we start the kernel thread, print out the 
- * kswapd initialization message (otherwise the init message 
- * may be printed in the middle of another driver's init 
- * message).  It looks very bad when that happens.
- */
-void __init kswapd_setup(void)
+static int do_free_user_and_cache(int priority, int gfp_mask)
 {
-       int i;
-       char *revision="$Revision: 1.5 $", *s, *e;
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
 
-       swap_setup();
-       
-       if ((s = strchr(revision, ':')) &&
-           (e = strchr(s, '$')))
-               s++, i = e - s;
-       else
-               s = revision, i = -1;
-       printk ("Starting kswapd v%.*s\n", i, s);
-}
-
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
-
-static int kswapd_free_pages(int kswapd_state)
-{
-	unsigned long end_time;
-
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
-
-	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 8;
-		int count = pager_daemon.swap_cluster;
+	if (swap_out(priority, gfp_mask & ~__GFP_WAIT))
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				free_memory(swap_out(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
-			break;
-	} while (time_before_eq(jiffies,end_time));
-	return kswapd_state;
+	return 0;
 }
 
-/*
- * The background pageout daemon.
- * Started as a kernel thread from the init process.
- */
-int kswapd(void *unused)
+static int do_free_page(int * state, int gfp_mask)
 {
-	current->session = 1;
-	current->pgrp = 1;
-	strcpy(current->comm, "kswapd");
-	sigfillset(&current->blocked);
-	
-	/*
-	 *	As a kernel thread we want to tamper with system buffers
-	 *	and other internals and thus be subject to the SMP locking
-	 *	rules. (On a uniprocessor box this does nothing).
-	 */
-	lock_kernel();
+	int priority = 6;
 
-	/*
-	 * Set the base priority to something smaller than a
-	 * regular process. We will scale up the priority
-	 * dynamically depending on how much memory we need.
-	 */
-	current->priority = (DEF_PRIORITY * 2) / 3;
-
-	/*
-	 * Tell the memory management that we're a "memory allocator",
-	 * and that if we need more memory we should get access to it
-	 * regardless (see "try_to_free_pages()"). "kswapd" should
-	 * never get caught in the normal page freeing logic.
-	 *
-	 * (Kswapd normally doesn't need memory anyway, but sometimes
-	 * you need a small amount of memory in order to be able to
-	 * page out something else, and this flag essentially protects
-	 * us from recursively trying to free more memory as we're
-	 * trying to free the first piece of memory in the first place).
-	 */
-	current->flags |= PF_MEMALLOC;
+	switch (*state) {
+		do {
+		case 0:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 0;
 
-	init_swap_timer();
-	kswapd_task = current;
-	while (1) {
-		int state = 0;
-
-		current->state = TASK_INTERRUPTIBLE;
-		flush_signals(current);
-		run_task_queue(&tq_disk);
-		schedule();
-		swapstats.wakeups++;
-		state = kswapd_free_pages(state);
+			shrink_dcache_memory(priority, gfp_mask);
+			kmem_cache_reap(gfp_mask);
+		} while (--priority >= 0);
 	}
-	/* As if we could ever get here - maybe we want to make this killable */
-	kswapd_task = NULL;
-	unlock_kernel();
 	return 0;
 }
 
@@ -543,107 +491,26 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
-
 		current->flags |= PF_MEMALLOC;
-	
-		priority = 8;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		while (count--)
+		{
+			static int state = 0;
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+		}
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
 
 	return retval;
-}
-
-/*
- * Wake up kswapd according to the priority
- *	0 - no wakeup
- *	1 - wake up as a low-priority process
- *	2 - wake up as a normal process
- *	3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(struct task_struct *p, int priority)
-{
-	if (priority) {
-		p->counter = p->priority << priority;
-		wake_up_process(p);
-	}
-}
-
-/* 
- * The swap_tick function gets called on every clock tick.
- */
-void swap_tick(void)
-{
-	struct task_struct *p = kswapd_task;
-
-	/*
-	 * Only bother to try to wake kswapd up
-	 * if the task exists and can be woken.
-	 */
-	if (p && (p->state & TASK_INTERRUPTIBLE)) {
-		unsigned int pages;
-		int want_wakeup;
-
-		/*
-		 * Schedule for wakeup if there isn't lots
-		 * of free memory or if there is too much
-		 * of it used for buffers or pgcache.
-		 *
-		 * "want_wakeup" is our priority: 0 means
-		 * not to wake anything up, while 3 means
-		 * that we'd better give kswapd a realtime
-		 * priority.
-		 */
-		want_wakeup = 0;
-		pages = nr_free_pages;
-		if (pages < freepages.high)
-			want_wakeup = 1;
-		if (pages < freepages.low)
-			want_wakeup = 2;
-		if (pages < freepages.min)
-			want_wakeup = 3;
-	
-		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
-}
-
-/* 
- * Initialise the swap timer
- */
-
-void init_swap_timer(void)
-{
-	timer_table[SWAP_TIMER].expires = jiffies;
-	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
 }
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.5 linux/mm/page_alloc.c:1.1.1.1.2.14
--- linux/mm/page_alloc.c:1.1.1.5	Sun Jan  3 20:42:44 1999
+++ linux/mm/page_alloc.c	Mon Jan  4 18:42:54 1999
@@ -260,7 +260,8 @@
 		if (nr_free_pages > freepages.min) {
 			if (!current->trashing_memory)
 				goto ok_to_allocate;
-			if (nr_free_pages > freepages.low) {
+			if (!(current->flags & PF_MEMALLOC) &&
+			    nr_free_pages > freepages.low) {
 				current->trashing_memory = 0;
 				goto ok_to_allocate;
 			}
@@ -271,7 +272,7 @@
 		 * memory.
 		 */
 		current->trashing_memory = 1;
-		if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+		if (!try_to_free_pages(gfp_mask, freepages.high - nr_free_pages + 1<<order) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 			goto nopage;
 	}
 ok_to_allocate:
Index: linux/init/main.c
diff -u linux/init/main.c:1.1.1.5 linux/init/main.c:1.1.1.1.2.9
--- linux/init/main.c:1.1.1.5	Tue Dec 29 01:39:16 1998
+++ linux/init/main.c	Mon Jan  4 18:42:54 1999
@@ -63,8 +63,6 @@
 
 static int init(void *);
 extern int bdflush(void *);
-extern int kswapd(void *);
-extern void kswapd_setup(void);
 
 extern void init_IRQ(void);
 extern void init_modules(void);
@@ -1269,9 +1267,6 @@
 
 	/* Launch bdflush from here, instead of the old syscall way. */
 	kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
-	/* Start the background pageout daemon. */
-	kswapd_setup();
-	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
 	/* Start the async paging daemon. */
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.2.13
--- linux/include/linux/mm.h:1.1.1.3	Sat Jan  2 15:24:18 1999
+++ linux/include/linux/mm.h	Mon Jan  4 18:42:52 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -295,8 +294,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.5 linux/mm/swap.c:1.1.1.1.2.8
--- linux/mm/swap.c:1.1.1.5	Sat Jan  2 15:24:40 1999
+++ linux/mm/swap.c	Sat Jan  2 21:40:13 1999
@@ -64,13 +64,13 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
+	5,	/* minimum percent buffer */
 	10,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	2,	/* minimum percent page cache */
+	5,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
 };
Index: linux/include/linux/swap.h
diff -u linux/include/linux/swap.h:1.1.1.4 linux/include/linux/swap.h:1.1.1.1.2.9
--- linux/include/linux/swap.h:1.1.1.4	Tue Dec 29 01:39:03 1998
+++ linux/include/linux/swap.h	Tue Dec 29 02:19:08 1998
@@ -167,9 +167,11 @@
 	count = atomic_read(&page->count);
 	if (PageSwapCache(page))
 	{
+#if 0
 		/* PARANOID */
 		if (page->inode != &swapper_inode)
 			panic("swap cache page has wrong inode\n");
+#endif
 		count += swap_count(page->offset) - 2;
 	}
 	if (PageFreeAfter(page))


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 18:08                   ` [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] Andrea Arcangeli
@ 1999-01-04 20:56                     ` Linus Torvalds
  1999-01-04 21:10                       ` Rik van Riel
                                         ` (2 more replies)
  0 siblings, 3 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-04 20:56 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Steve Bergman, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm



On Mon, 4 Jan 1999, Andrea Arcangeli wrote:
>
> I have a new revolutionary patch. The main thing is that I killed kswapd
> just to make Rik happy ;).

Ehh..

You may have made Rik happy, but you totally missed the reason for kswapd. 
And while your patch looked interesting (a lot cleaner than the previous
ones, and I _like_ patches that remove code), the fact that you killed
kswapd means that it is essentially useless. 

Basically, we _have_ to have kswapd, and I'll tell you why:
 - imagine running low on memory due to GFP_ATOMIC
 - imagine not having any normal processes that do memory alloction.

Boom. You just killed the machine with your patch, because maybe the
GFP_ATOMIC things are what the machine is doing. Imagine a machine that
acts as a router - it might not even be running any normal user processes
at _all_, but it had damn well better make sure that memory is always
available some way. "kswapd" did that for us, and Rik's happiness counts
as nothing in face of basic facts of life like that. Sorry.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 20:56                     ` Linus Torvalds
@ 1999-01-04 21:10                       ` Rik van Riel
  1999-01-04 22:04                       ` Alan Cox
  1999-01-04 22:29                       ` Andrea Arcangeli
  2 siblings, 0 replies; 243+ messages in thread
From: Rik van Riel @ 1999-01-04 21:10 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, Steve Bergman, Benjamin Redelings I,
	Stephen C. Tweedie, linux-kernel, Alan Cox, linux-mm

On Mon, 4 Jan 1999, Linus Torvalds wrote:
> On Mon, 4 Jan 1999, Andrea Arcangeli wrote:
> >
> > I have a new revolutionary patch. The main thing is that I killed kswapd
> > just to make Rik happy ;).
> 
> You may have made Rik happy,

Not even that -- I really like the concept of a separate
thread doing the much needed page freeing...

> but you totally missed the reason for kswapd.  And while your
> patch looked interesting (a lot cleaner than the previous ones,
> and I _like_ patches that remove code), the fact that you killed
> kswapd means that it is essentially useless.

Yup -- a definite No-No.
(just to make sure that nobody would have really gotten
the impression that I would be happy with the removal
of kswapd)

cheers,

Rik -- If a Microsoft product fails, who do you sue?
+-------------------------------------------------------------------+
| Linux memory management tour guide.        riel@humbolt.geo.uu.nl |
| Scouting Vries cubscout leader.    http://humbolt.geo.uu.nl/~riel |
+-------------------------------------------------------------------+

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 22:04                       ` Alan Cox
@ 1999-01-04 21:55                         ` Linus Torvalds
  1999-01-04 22:51                           ` Andrea Arcangeli
  1999-01-04 22:43                         ` [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-04 21:55 UTC (permalink / raw
  To: Alan Cox
  Cc: andrea, steve, bredelin, sct, linux-kernel, H.H.vanRiel, linux-mm



On Mon, 4 Jan 1999, Alan Cox wrote:
> > Boom. You just killed the machine with your patch, because maybe the
> > GFP_ATOMIC things are what the machine is doing. Imagine a machine that
> > acts as a router - it might not even be running any normal user processes
> > at _all_, but it had damn well better make sure that memory is always
> > available some way. "kswapd" did that for us, and Rik's happiness counts
> > as nothing in face of basic facts of life like that. Sorry.
> 
> Its performance properties are very interesting however. They do seem to suggest
> kswapd should be more of a last resort. 

Agreed, I found that interesting too. The solution may just be to make
kswapd run a lot less often rather than removing it - for the
machine-killing out-of-memory situation it doesn't matter if kswapd runs
just a few times a second or something like that. 

However, one of the things I found so appealing with the patch was the
fact that it removed a lot of code, and that wouldn't be true for
something that just changed kswapd to run less often. Oh, well. 

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 20:56                     ` Linus Torvalds
  1999-01-04 21:10                       ` Rik van Riel
@ 1999-01-04 22:04                       ` Alan Cox
  1999-01-04 21:55                         ` Linus Torvalds
  1999-01-04 22:43                         ` [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] Andrea Arcangeli
  1999-01-04 22:29                       ` Andrea Arcangeli
  2 siblings, 2 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-04 22:04 UTC (permalink / raw
  To: Linus Torvalds
  Cc: andrea, steve, bredelin, sct, linux-kernel, alan, H.H.vanRiel,
	linux-mm

> Boom. You just killed the machine with your patch, because maybe the
> GFP_ATOMIC things are what the machine is doing. Imagine a machine that
> acts as a router - it might not even be running any normal user processes
> at _all_, but it had damn well better make sure that memory is always
> available some way. "kswapd" did that for us, and Rik's happiness counts
> as nothing in face of basic facts of life like that. Sorry.

Its performance properties are very interesting however. They do seem to suggest
kswapd should be more of a last resort. 

Alan


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 20:56                     ` Linus Torvalds
  1999-01-04 21:10                       ` Rik van Riel
  1999-01-04 22:04                       ` Alan Cox
@ 1999-01-04 22:29                       ` Andrea Arcangeli
  2 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-04 22:29 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Benjamin Redelings I, Stephen C. Tweedie,
	linux-kernel, Alan Cox, Rik van Riel, linux-mm

On Mon, 4 Jan 1999, Linus Torvalds wrote:

> GFP_ATOMIC things are what the machine is doing. Imagine a machine that
> acts as a router - it might not even be running any normal user processes

Argh, I didn't think of that; now I understand the point... But I am
pretty sure we can continue to do async swapout also from the process
path. I think it works fine because now swapout is only a bank credit. It
works faster obviously because the process doesn't need to block, and so
requesting many swapouts at one time will dramatically improve swapout
I/O performance... 

I am going to re-insert the poor kswapd now ;)

Thanks.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 22:04                       ` Alan Cox
  1999-01-04 21:55                         ` Linus Torvalds
@ 1999-01-04 22:43                         ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-04 22:43 UTC (permalink / raw
  To: Alan Cox
  Cc: Linus Torvalds, steve, bredelin, sct, linux-kernel, H.H.vanRiel,
	linux-mm

On Mon, 4 Jan 1999, Alan Cox wrote:

> Its performance properties are very interesting however. They do seem to suggest
> kswapd should be more of a last resort. 

Steve told me now that the image test does not run as fast as in arca-3
(the one before inserting my new swap_out() smart weight code), but here
there are no doubts. My latest patch doubles performance under swap here
and everything is _far_ more fluid (I tried it only on 128Mbyte of RAM
though). I went to the cinema in the meantime and I tried again now with
the same results... 

Just to allow everyone to see the difference (and to tell me if eventually
I am missing something of magic ;) here is the bench I am using:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * VM benchmark: allocate 160 chunks of ~1 MB each, then repeatedly
 * dirty all of them and print the elapsed wall-clock seconds per pass.
 */
int main(void)
{
	char *p[160];
	int i, j;
	int count;
	time_t start, stop;

	for (j = 0; j < 160; j++)
	{
		p[j] = malloc(1000000);
		if (!p[j]) {
			perror("malloc");
			exit(1);
		}
	}
	for (count = 0; count < 2000; count++)
	{
		start = time(NULL);
		for (j = 0; j < 160; j++)
		{
			for (i = 0; i < 1000000; i++)
				p[j][i] = 0;
		}
		stop = time(NULL);
		/*
		 * A time_t difference must not be printed with %u (UB);
		 * cast to long and use %ld instead.  The first pass is
		 * skipped so warm-up allocation faults don't skew the
		 * numbers.
		 */
		if (count)
			printf("elapsed %ld\n", (long)(stop - start));
		fflush(stdout);
	}
	return 0;
}

The number 160 means that the benchmark will tell you the time in sec it
takes to dirtify 160 mbyte of virtual memory in loop. It now runs in 54
sec (against 100 before) and I am writing this in the meantime without see
differences with an idle system (I couldn't open pine and sort some huge
folder without any kind of slowdown under the same conditions before). My
I/O is _slowww__ I have _everything_ in a IDE 6mbyte/sec disk and the seek
time is really a pain (note it's the HD that is slowww, I like IDE ;). 

I am going to revert everything except the new things that caused the
benchmark to double performances and the system to go far more fluid, to
arca-vm-3 (that it's reported to be the fastest vm out there by steve
under misc swapping usage (the image test)). I probably leave the
swap_out() smart weight code since it's really needed on low memory even
if it seems that the swapout weight is causing a bit of slowdown probably
because it's not tuned right now.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 21:55                         ` Linus Torvalds
@ 1999-01-04 22:51                           ` Andrea Arcangeli
  1999-01-05  0:32                             ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-04 22:51 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Alan Cox, steve, bredelin, sct, linux-kernel, H.H.vanRiel,
	linux-mm

On Mon, 4 Jan 1999, Linus Torvalds wrote:

> However, one of the things I found so appealing with the patch was the
> fact that it removed a lot of code, and that wouldn't be true for
> something that just changed kswapd to run less often. Oh, well. 

We can still remove the dynamic prio thing and the
run-one-jiffy-and-schedule thing since we don't need to give
swapout performances via kswapd anymore allowing the process to swapout
async and take credits from the bank some time after...

We can more simply schedule() if need_resched is set inside the kswapd
engine.

I am going to do something like that right now...

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-04 22:51                           ` Andrea Arcangeli
@ 1999-01-05  0:32                             ` Andrea Arcangeli
  1999-01-05  0:52                               ` Zlatko Calusic
                                                 ` (2 more replies)
  0 siblings, 3 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-05  0:32 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Alan Cox, steve, bredelin, sct, linux-kernel, H.H.vanRiel,
	linux-mm

On Mon, 4 Jan 1999, Andrea Arcangeli wrote:

> I am going to do something like that right now...

Here is a new patch (arca-vm-7). It practically removes kswapd for all places
except the ATOMIC memory allocation if there aren't process that are just
freeing memory. 

It returns also to the stock/arca-vm-3 shrink_mmap() (even if it seems
slower).

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.64
--- linux/mm/vmscan.c:1.1.1.9	Sat Jan  2 15:46:20 1999
+++ linux/mm/vmscan.c	Tue Jan  5 01:02:43 1999
@@ -10,6 +10,14 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * Developed the balanced page freeing algorithm (do_free_user_and_cache).
+ * Developed a smart mechanism to handle the swapout weight.
+ * Allowed the process to swapout async and only then get the credit from
+ * the bank. This has doubled swapout performances and fluidness.
+ * Copyright (C) 1998  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -21,12 +29,15 @@
 #include <asm/pgtable.h>
 
 /* 
+ * When are we next due for a page scan? 
+ */
+static atomic_t nr_tasks_freeing_memory = ATOMIC_INIT(0);
+
+/* 
  * The wait queue for waking up the pageout daemon:
  */
 static struct task_struct * kswapd_task = NULL;
 
-static void init_swap_timer(void);
-
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -163,7 +174,7 @@
 			 * cache. */
 			if (PageSwapCache(page_map)) {
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return 1;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -195,7 +206,7 @@
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return 1;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,9 +221,8 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
 	__free_page(page_map);
-	return entry;
+	return 1;
 }
 
 /*
@@ -230,7 +240,7 @@
  */
 
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -251,18 +261,20 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		address += PAGE_SIZE;
+		tsk->swap_address = address;
 		if (result)
 			return result;
-		address += PAGE_SIZE;
+		if (!--*counter)
+			return 0;
 		pte++;
 	} while (address < end);
 	return 0;
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -282,9 +294,11 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
@@ -292,7 +306,7 @@
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, int gfp_mask)
+	unsigned long address, int gfp_mask, unsigned long * counter)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -306,16 +320,19 @@
 
 	end = vma->vm_end;
 	while (address < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	}
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_process(struct task_struct * p, int gfp_mask,
+			    unsigned long * counter)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -334,9 +351,12 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(p, vma, address, gfp_mask);
+			int result = swap_out_vma(p, vma, address, gfp_mask,
+						  counter);
 			if (result)
 				return result;
+			if (!*counter)
+				return 0;
 			vma = vma->vm_next;
 			if (!vma)
 				break;
@@ -350,6 +370,25 @@
 	return 0;
 }
 
+static inline unsigned long calc_swapout_weight(int priority)
+{
+	struct task_struct * p;
+	unsigned long total_vm = 0;
+
+	read_lock(&tasklist_lock);
+	for_each_task(p)
+	{
+		if (!p->swappable)
+			continue;
+		if (p->mm->rss == 0)
+			continue;
+		total_vm += p->mm->total_vm;
+	}
+	read_unlock(&tasklist_lock);
+
+	return total_vm / (priority+1);
+}
+
 /*
  * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
@@ -358,7 +397,10 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int assign;
+	unsigned long counter, max_cnt;
+
+	counter = calc_swapout_weight(priority);
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -374,23 +416,17 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
-	if (counter < 1)
-		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
-
-	for (; counter >= 0; counter--) {
+	while (counter != 0) {
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
 	select:
 		read_lock(&tasklist_lock);
-		p = init_task.next_task;
-		for (; p != &init_task; p = p->next_task) {
+		for_each_task(p)
+		{
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -410,10 +446,11 @@
 		}
 
 		/*
-		 * Nonzero means we cleared out something, but only "1" means
-		 * that we actually free'd up a page as a result.
+		 * Nonzero means we cleared out something, and "1" means
+		 * that we actually moved a page from the process memory
+		 * to the swap cache (it's not been freed yet).
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
+		if (swap_out_process(pbest, gfp_mask, &counter))
 			return 1;
 	}
 out:
@@ -441,39 +478,62 @@
        printk ("Starting kswapd v%.*s\n", i, s);
 }
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
 
-static int kswapd_free_pages(int kswapd_state)
+	/*
+	 * NOTE: Here we allow also the process to do async swapout
+	 * because the swapout is really only a credit at the bank of
+	 * free memory right now. So we don't care to have it _now_.
+	 * Allowing async I/O we are going to improve drammatically
+	 * swapout performance -arca (discovered this afternoon ;) 980105
+	 */
+	if (swap_out(priority, gfp_mask & ~__GFP_WAIT))
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
+
+	return 0;
+}
+
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 8;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	switch (*state) {
+		do {
+		case 0:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 0;
 
-	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 8;
-		int count = pager_daemon.swap_cluster;
+			shrink_dcache_memory(priority, gfp_mask);
+			kmem_cache_reap(gfp_mask);
+		} while (--priority >= 0);
+	}
+	return 0;
+}
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				free_memory(swap_out(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+static int kswapd_free_pages(int kswapd_state)
+{
+	for(;;)
+	{
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
+			break;
+		if (atomic_read(&nr_tasks_freeing_memory))
 			break;
-	} while (time_before_eq(jiffies,end_time));
+		if (kswapd_task->need_resched)
+			schedule();
+	};
 	return kswapd_state;
 }
 
@@ -496,13 +556,6 @@
 	lock_kernel();
 
 	/*
-	 * Set the base priority to something smaller than a
-	 * regular process. We will scale up the priority
-	 * dynamically depending on how much memory we need.
-	 */
-	current->priority = (DEF_PRIORITY * 2) / 3;
-
-	/*
 	 * Tell the memory management that we're a "memory allocator",
 	 * and that if we need more memory we should get access to it
 	 * regardless (see "try_to_free_pages()"). "kswapd" should
@@ -516,7 +569,6 @@
 	 */
 	current->flags |= PF_MEMALLOC;
 
-	init_swap_timer();
 	kswapd_task = current;
 	while (1) {
 		int state = 0;
@@ -543,107 +595,37 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
 	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
-
 		current->flags |= PF_MEMALLOC;
-	
-		priority = 8;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
+		atomic_inc(&nr_tasks_freeing_memory);
+		while (count--)
+		{
+			static int state = 0;
+			if (!do_free_page(&state, gfp_mask))
+			{
+				retval = 0;
+				break;
+			}
+		}
+		atomic_dec(&nr_tasks_freeing_memory);
 		current->flags &= ~PF_MEMALLOC;
 	}
 	unlock_kernel();
 
 	return retval;
 }
-
-/*
- * Wake up kswapd according to the priority
- *	0 - no wakeup
- *	1 - wake up as a low-priority process
- *	2 - wake up as a normal process
- *	3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(struct task_struct *p, int priority)
-{
-	if (priority) {
-		p->counter = p->priority << priority;
-		wake_up_process(p);
-	}
-}
 
-/* 
- * The swap_tick function gets called on every clock tick.
- */
-void swap_tick(void)
+void kswapd_wakeup(void)
 {
-	struct task_struct *p = kswapd_task;
-
-	/*
-	 * Only bother to try to wake kswapd up
-	 * if the task exists and can be woken.
-	 */
-	if (p && (p->state & TASK_INTERRUPTIBLE)) {
-		unsigned int pages;
-		int want_wakeup;
-
-		/*
-		 * Schedule for wakeup if there isn't lots
-		 * of free memory or if there is too much
-		 * of it used for buffers or pgcache.
-		 *
-		 * "want_wakeup" is our priority: 0 means
-		 * not to wake anything up, while 3 means
-		 * that we'd better give kswapd a realtime
-		 * priority.
-		 */
-		want_wakeup = 0;
-		pages = nr_free_pages;
-		if (pages < freepages.high)
-			want_wakeup = 1;
-		if (pages < freepages.low)
-			want_wakeup = 2;
-		if (pages < freepages.min)
-			want_wakeup = 3;
-	
-		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
-}
+	struct task_struct * p = kswapd_task;
 
-/* 
- * Initialise the swap timer
- */
-
-void init_swap_timer(void)
-{
-	timer_table[SWAP_TIMER].expires = jiffies;
-	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	if (p && (p->state & TASK_INTERRUPTIBLE) &&
+	    !atomic_read(&nr_tasks_freeing_memory))
+		wake_up_process(p);
 }
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.5 linux/mm/page_alloc.c:1.1.1.1.2.15
--- linux/mm/page_alloc.c:1.1.1.5	Sun Jan  3 20:42:44 1999
+++ linux/mm/page_alloc.c	Tue Jan  5 01:13:00 1999
@@ -151,7 +151,6 @@
 	if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
 		if (PageSwapCache(page))
 			panic ("Freeing swap cache page");
-		page->flags &= ~(1 << PG_referenced);
 		free_pages_ok(page->map_nr, 0);
 		return;
 	}
@@ -173,7 +172,6 @@
 		if (atomic_dec_and_test(&map->count)) {
 			if (PageSwapCache(map))
 				panic ("Freeing swap cache pages");
-			map->flags &= ~(1 << PG_referenced);
 			free_pages_ok(map_nr, order);
 			return;
 		}
@@ -260,7 +258,8 @@
 		if (nr_free_pages > freepages.min) {
 			if (!current->trashing_memory)
 				goto ok_to_allocate;
-			if (nr_free_pages > freepages.low) {
+			if (!(current->flags & PF_MEMALLOC) &&
+			    nr_free_pages > freepages.low) {
 				current->trashing_memory = 0;
 				goto ok_to_allocate;
 			}
@@ -271,8 +270,11 @@
 		 * memory.
 		 */
 		current->trashing_memory = 1;
-		if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+		if (!try_to_free_pages(gfp_mask, freepages.high - nr_free_pages + 1<<order) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 			goto nopage;
+	} else {
+		if (nr_free_pages < freepages.min)
+			kswapd_wakeup();
 	}
 ok_to_allocate:
 	spin_lock_irqsave(&page_alloc_lock, flags);
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.2.13
--- linux/include/linux/mm.h:1.1.1.3	Sat Jan  2 15:24:18 1999
+++ linux/include/linux/mm.h	Mon Jan  4 18:42:52 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -295,8 +294,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.5 linux/mm/swap.c:1.1.1.1.2.8
--- linux/mm/swap.c:1.1.1.5	Sat Jan  2 15:24:40 1999
+++ linux/mm/swap.c	Sat Jan  2 21:40:13 1999
@@ -64,13 +64,13 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
+	5,	/* minimum percent buffer */
 	10,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	2,	/* minimum percent page cache */
+	5,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
 };
Index: linux/include/linux/swap.h
diff -u linux/include/linux/swap.h:1.1.1.4 linux/include/linux/swap.h:1.1.1.1.2.10
--- linux/include/linux/swap.h:1.1.1.4	Tue Dec 29 01:39:03 1998
+++ linux/include/linux/swap.h	Tue Jan  5 01:12:59 1999
@@ -83,6 +83,7 @@
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(unsigned int gfp_mask, int count);
+extern void kswapd_wakeup(void);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, unsigned long, char *, int);
@@ -167,9 +168,11 @@
 	count = atomic_read(&page->count);
 	if (PageSwapCache(page))
 	{
+#if 0
 		/* PARANOID */
 		if (page->inode != &swapper_inode)
 			panic("swap cache page has wrong inode\n");
+#endif
 		count += swap_count(page->offset) - 2;
 	}
 	if (PageFreeAfter(page))


Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-05  0:32                             ` Andrea Arcangeli
@ 1999-01-05  0:52                               ` Zlatko Calusic
  1999-01-05  3:02                               ` Zlatko Calusic
  1999-01-05 15:35                               ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Andrea Arcangeli
  2 siblings, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-05  0:52 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Alan Cox, steve, bredelin, sct, linux-kernel,
	H.H.vanRiel, linux-mm

Andrea Arcangeli <andrea@e-mind.com> writes:

> -		if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
> +		if (!try_to_free_pages(gfp_mask, freepages.high - nr_free_pages + 1<<order) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

How about a pair of parentheses at a strategic place? :)

Other than that, your previous (-6?) patch really works good here.

It was once that I wanted to get rid of kswapd, too, but I thought it would
surely harm performance, so I dumped the idea. Now, I'm not at all sure. :)

Keep trying!
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-05  0:32                             ` Andrea Arcangeli
  1999-01-05  0:52                               ` Zlatko Calusic
@ 1999-01-05  3:02                               ` Zlatko Calusic
  1999-01-05 11:49                                 ` Andrea Arcangeli
  1999-01-05 15:35                               ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Andrea Arcangeli
  2 siblings, 1 reply; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-05  3:02 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Alan Cox, steve, bredelin, sct, linux-kernel,
	H.H.vanRiel, linux-mm

Andrea Arcangeli <andrea@e-mind.com> writes:

> On Mon, 4 Jan 1999, Andrea Arcangeli wrote:
> 
> > I am going to do something like that right now...
> 
> Here is a new patch (arca-vm-7). It practically removes kswapd for all places
> except the ATOMIC memory allocation if there aren't process that are just
> freeing memory. 
> 

You have a bug somewhere!

At this point (output of Alt-SysRq-M), machine locked:

Jan  5 03:49:14 atlas kernel: Free pages:         512kB 
Jan  5 03:49:14 atlas kernel:  ( Free: 128 (128 256 384) 
Jan  5 03:49:14 atlas kernel: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 4*128kB = 512kB) 

Probably you have "< instead of <=", or similar logic problem
somewhere.

Bug revealed itself during "mmap-sync" run. It's a program that
utilises bug with shared mappings (you used to send patches for that
one, I don't know if they made it to the tree, so I check
occasionally).

Other than that, VM is really fast, in fact unbelievably fast. Kswapd
is very light on the CPU and interactive feel is great.
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-05  3:02                               ` Zlatko Calusic
@ 1999-01-05 11:49                                 ` Andrea Arcangeli
  1999-01-05 13:23                                   ` Zlatko Calusic
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-05 11:49 UTC (permalink / raw
  To: Zlatko Calusic; +Cc: linux-kernel, linux-mm

On 5 Jan 1999, Zlatko Calusic wrote:

> At this point (output of Alt-SysRq-M), machine locked:

Are you been able to continue using SysRq-K?

Could you reproduce and press ALT-right+Scroll-Lock and tell me what the
kernel was executing at that time...

Could you send me also the proggy for the shared-mmaps to allow me to
reproduce?

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-05 11:49                                 ` Andrea Arcangeli
@ 1999-01-05 13:23                                   ` Zlatko Calusic
  1999-01-05 15:42                                     ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-05 13:23 UTC (permalink / raw
  To: Andrea Arcangeli; +Cc: linux-kernel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 811 bytes --]

Andrea Arcangeli <andrea@e-mind.com> writes:

> On 5 Jan 1999, Zlatko Calusic wrote:
> 
> > At this point (output of Alt-SysRq-M), machine locked:
> 
> Are you been able to continue using SysRq-K?

Erm... I continued with *&#&%$ Alt-SysRq-{S,U,B}.
That worked for me. :)

> 
> Could you reproduce and press ALT-right+Scroll-Lock and tell me what the
> kernel was executing at that time...
>

I tried few times, but to no avail. Looks like subtle race, bad news
for you, unfortunately.

*BUT*, after I pressed ctrl-c against mmap-sync in one of the torture
tests, the program stuck in down_failed (loadav += 2). Few minutes
later machine got very unstable and I decided to reboot it. Go figure.

> Could you send me also the proggy for the shared-mmaps to allow me to
> reproduce?
> 

Sure, just be careful. :)


[-- Attachment #2: Exercise shared mappings --]
[-- Type: application/octet-stream, Size: 1009 bytes --]

#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>

/* 
 * file size, should be half of the size of the physical memory
 */
#define FILESIZE (32 * 1024 * 1024)

/*
 * Exercise shared mappings: create a FILESIZE sparse file, map it
 * MAP_SHARED, dirty every page, then loop forever forking children
 * that fsync the file, to stress the VM's msync/swapout paths.
 */
int main(void)
{
  char *ptr;
  int fd, i;
  char c = 'A';
  pid_t pid;

  /* O_CREAT requires an explicit mode argument; omitting it is UB. */
  if ((fd = open("foo", O_RDWR | O_CREAT | O_TRUNC, 0666)) == -1) {
    perror("open");
    exit(1);
  }
  lseek(fd, FILESIZE - 1, SEEK_SET);
  /* write one byte to extend the file */
  if (write(fd, &fd, 1) != 1) {
    perror("write");
    exit(1);
  }

  /* get a shared mapping */
  ptr = mmap(0, FILESIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (ptr == MAP_FAILED) {
    /* mmap reports failure as MAP_FAILED ((void *)-1), not NULL */
    perror("mmap");
    exit(1);
  }

  /* touch all pages in the mapping */
  for (i = 0; i < FILESIZE; i += 4096)
    ptr[i] = c;

  while (1) {
    if ((pid = fork())) { /* parent, wait */
      waitpid(pid, NULL, 0);
    } else { /* child, exec away */
#if 0
      execl("/bin/echo", "echo", "blah");
#else
      fsync(fd);
      printf("blah\n");
      exit(0);
#endif
    }
    sleep(5);
  }
}

[-- Attachment #3: Type: text/plain, Size: 84 bytes --]


P.S. Apologies for too many jokes, I didn't sleep at all last night. ;)
-- 
Zlatko

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] new-vm improvement [Re: 2.2.0 Bug summary]
  1999-01-03  2:59                 ` Andrea Arcangeli
  1999-01-04 18:08                   ` [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] Andrea Arcangeli
@ 1999-01-05 13:33                   ` Ben McCann
  1 sibling, 0 replies; 243+ messages in thread
From: Ben McCann @ 1999-01-05 13:33 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Steve Bergman, Linus Torvalds, Benjamin Redelings I,
	Stephen C. Tweedie, linux-kernel, Alan Cox, Rik van Riel,
	linux-mm

Hi Andrea,

My pet VM benchmark is the compilation of a set of about 50 C++
files which regularly grow the EGCS compiler VM size (as shown
by 'top') to 75 to 90 MB. I only have 64MB of RAM so it swaps a lot.

Here are the times (as measured by the 'time' command) for the
compilation of this suite of files (using 'make' and EGCS 1.0.1)
with 2.2.0pre4 and 2.2.0pre4 with your latest VM patch:

 TMS Compile with 2.2.0pre4
 589.830u 68.830s 18:09.88 60.4% 0+0k 0+0io 188062pf+260255w

 TMS Compile with 2.2.0pre4 and Andreas latest patch
 597.840u 71.030s 21:59.36 50.6% 0+0k 0+0io 298514pf+237324w
                  ^^^^^^^^                  ^^^^^^

Note the wall-clock time increases from 18 minutes to almost
22 minutes and the number of page faults increases from 188,000
to 298,500. It seems something is invalidating pages too aggressively
in your patch.

Is there something I can tune to improve this? Is there an experiment
I can run to help fine-tune your VM changes?

-Ben McCann

-- 
Ben McCann                              Indus River Networks
                                        31 Nagog Park
                                        Acton, MA, 01720
email: bmccann@indusriver.com           web: www.indusriver.com 
phone: (978) 266-8140                   fax: (978) 266-8111
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-05  0:32                             ` Andrea Arcangeli
  1999-01-05  0:52                               ` Zlatko Calusic
  1999-01-05  3:02                               ` Zlatko Calusic
@ 1999-01-05 15:35                               ` Andrea Arcangeli
  1999-01-06 14:48                                 ` Andrea Arcangeli
  2 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-05 15:35 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Alan Cox, steve, bredelin, sct, linux-kernel, H.H.vanRiel,
	linux-mm

On Tue, 5 Jan 1999, Andrea Arcangeli wrote:

> Here a new patch (arca-vm-7). It pratically removes kswapd for all places

I fixed some thing in arca-vm-7. This new is arca-vm-8.

The main change is the fix of the trashing_memory heuristic. Now the the
free memory is always between low and high and it's left to the trashing
task to take the limits uptodate. This way I can run the swapout bench and
while :; do free; done, and the shell script _never_ gets blocked (as
opposed to arca-vm-7 and previous).

I return to right removing the referenced flag from the freed pages since
it seems to make no performance differences and it looks cleaner to me (I 
removed it in the last patch because I didn't benchmarked it and I
worried that it was the bit that made the difference between arca-vm-3).

The new patch returns to allow the pgcache to be shrunk even if pgcache
is under min. This make sense since this way shrink_mmap() is able to
really_swapout more pages even if we are really low on memory.

This new patches is very more efficient than the last one. I still don't
need kswapd...

Forget to tell, I moved the swapout weight to an exponential behavior... 
(since the new global patch it's working very better I have not compared
with the linear /(priority+1) thing).

I guess the lockup that Zlatko reported is due the bug he discovered (some
missing `()' ;). Thanks Zlatko. I tried a proggy that sync some shared
mmap and everything is fine here... 

I guess that this new code will be very better also in low memory machines
than the last one...

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.67
--- linux/mm/vmscan.c:1.1.1.9	Sat Jan  2 15:46:20 1999
+++ linux/mm/vmscan.c	Tue Jan  5 16:17:00 1999
@@ -10,6 +10,14 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * Developed the balanced page freeing algorithm (do_free_user_and_cache).
+ * Developed a smart mechanism to handle the swapout weight.
+ * Allowed the process to swapout async and only then get the credit from
+ * the bank. This has doubled swapout performances and fluidness.
+ * Copyright (C) 1998  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -21,12 +29,15 @@
 #include <asm/pgtable.h>
 
 /* 
+ * When are we next due for a page scan? 
+ */
+static atomic_t nr_tasks_freeing_memory = ATOMIC_INIT(0);
+
+/* 
  * The wait queue for waking up the pageout daemon:
  */
 static struct task_struct * kswapd_task = NULL;
 
-static void init_swap_timer(void);
-
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -163,7 +174,7 @@
 			 * cache. */
 			if (PageSwapCache(page_map)) {
 				__free_page(page_map);
-				return (atomic_read(&page_map->count) == 0);
+				return 1;
 			}
 			add_to_swap_cache(page_map, entry);
 			/* We checked we were unlocked way up above, and we
@@ -195,7 +206,7 @@
 		flush_tlb_page(vma, address);
 		swap_duplicate(entry);
 		__free_page(page_map);
-		return (atomic_read(&page_map->count) == 0);
+		return 1;
 	} 
 	/* 
 	 * A clean page to be discarded?  Must be mmap()ed from
@@ -210,9 +221,8 @@
 	flush_cache_page(vma, address);
 	pte_clear(page_table);
 	flush_tlb_page(vma, address);
-	entry = (atomic_read(&page_map->count) == 1);
 	__free_page(page_map);
-	return entry;
+	return 1;
 }
 
 /*
@@ -230,7 +240,7 @@
  */
 
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -251,18 +261,20 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		address += PAGE_SIZE;
+		tsk->swap_address = address;
 		if (result)
 			return result;
-		address += PAGE_SIZE;
+		if (!--*counter)
+			return 0;
 		pte++;
 	} while (address < end);
 	return 0;
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -282,9 +294,11 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
@@ -292,7 +306,7 @@
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, int gfp_mask)
+	unsigned long address, int gfp_mask, unsigned long * counter)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -306,16 +320,19 @@
 
 	end = vma->vm_end;
 	while (address < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	}
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_process(struct task_struct * p, int gfp_mask,
+			    unsigned long * counter)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -334,9 +351,12 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(p, vma, address, gfp_mask);
+			int result = swap_out_vma(p, vma, address, gfp_mask,
+						  counter);
 			if (result)
 				return result;
+			if (!*counter)
+				return 0;
 			vma = vma->vm_next;
 			if (!vma)
 				break;
@@ -350,6 +370,25 @@
 	return 0;
 }
 
+static inline unsigned long calc_swapout_weight(int priority)
+{
+	struct task_struct * p;
+	unsigned long total_vm = 0;
+
+	read_lock(&tasklist_lock);
+	for_each_task(p)
+	{
+		if (!p->swappable)
+			continue;
+		if (p->mm->rss == 0)
+			continue;
+		total_vm += p->mm->total_vm;
+	}
+	read_unlock(&tasklist_lock);
+
+	return total_vm >> (priority>>1);
+}
+
 /*
  * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
@@ -358,7 +397,10 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int assign;
+	unsigned long counter, max_cnt;
+
+	counter = calc_swapout_weight(priority);
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -374,23 +416,17 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
-	if (counter < 1)
-		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
-
-	for (; counter >= 0; counter--) {
+	while (counter != 0) {
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
 	select:
 		read_lock(&tasklist_lock);
-		p = init_task.next_task;
-		for (; p != &init_task; p = p->next_task) {
+		for_each_task(p)
+		{
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -410,10 +446,11 @@
 		}
 
 		/*
-		 * Nonzero means we cleared out something, but only "1" means
-		 * that we actually free'd up a page as a result.
+		 * Nonzero means we cleared out something, and "1" means
+		 * that we actually moved a page from the process memory
+		 * to the swap cache (it's not been freed yet).
 		 */
-		if (swap_out_process(pbest, gfp_mask) == 1)
+		if (swap_out_process(pbest, gfp_mask, &counter))
 			return 1;
 	}
 out:
@@ -440,40 +477,63 @@
                s = revision, i = -1;
        printk ("Starting kswapd v%.*s\n", i, s);
 }
+
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
 
-#define free_memory(fn) \
-	count++; do { if (!--count) goto done; } while (fn)
+	/*
+	 * NOTE: Here we allow also the process to do async swapout
+	 * because the swapout is really only a credit at the bank of
+	 * free memory right now. So we don't care to have it _now_.
+	 * Allowing async I/O we are going to improve drammatically
+	 * swapout performance -arca (discovered this afternoon ;) 980105
+	 */
+	if (swap_out(priority, gfp_mask & ~__GFP_WAIT))
+		/*
+		 * We done at least some swapping progress so return 1 in
+		 * this case. -arca
+		 */
+		return 1;
 
-static int kswapd_free_pages(int kswapd_state)
+	return 0;
+}
+
+static int do_free_page(int * state, int gfp_mask)
 {
-	unsigned long end_time;
+	int priority = 8;
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(0);
+	switch (*state) {
+		do {
+		case 0:
+			if (do_free_user_and_cache(priority, gfp_mask))
+				return 1;
+			*state = 1;
+		case 1:
+			if (shm_swap(priority, gfp_mask))
+				return 1;
+			*state = 0;
 
-	/* max one hundreth of a second */
-	end_time = jiffies + (HZ-1)/100;
-	do {
-		int priority = 8;
-		int count = pager_daemon.swap_cluster;
+			shrink_dcache_memory(priority, gfp_mask);
+			kmem_cache_reap(gfp_mask);
+		} while (--priority >= 0);
+	}
+	return 0;
+}
 
-		switch (kswapd_state) {
-			do {
-			default:
-				free_memory(shrink_mmap(priority, 0));
-				free_memory(swap_out(priority, 0));
-				kswapd_state++;
-			case 1:
-				free_memory(shm_swap(priority, 0));
-				shrink_dcache_memory(priority, 0);
-				kswapd_state = 0;
-			} while (--priority >= 0);
-			return kswapd_state;
-		}
-done:
-		if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+static int kswapd_free_pages(int kswapd_state)
+{
+	for(;;)
+	{
+		do_free_page(&kswapd_state, 0);
+		if (nr_free_pages > freepages.high)
 			break;
-	} while (time_before_eq(jiffies,end_time));
+		if (atomic_read(&nr_tasks_freeing_memory))
+			break;
+		if (kswapd_task->need_resched)
+			schedule();
+	};
 	return kswapd_state;
 }
 
@@ -496,13 +556,6 @@
 	lock_kernel();
 
 	/*
-	 * Set the base priority to something smaller than a
-	 * regular process. We will scale up the priority
-	 * dynamically depending on how much memory we need.
-	 */
-	current->priority = (DEF_PRIORITY * 2) / 3;
-
-	/*
 	 * Tell the memory management that we're a "memory allocator",
 	 * and that if we need more memory we should get access to it
 	 * regardless (see "try_to_free_pages()"). "kswapd" should
@@ -516,7 +569,6 @@
 	 */
 	current->flags |= PF_MEMALLOC;
 
-	init_swap_timer();
 	kswapd_task = current;
 	while (1) {
 		int state = 0;
@@ -543,107 +595,35 @@
  * if we need more memory as part of a swap-out effort we
  * will just silently return "success" to tell the page
  * allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to 
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
  */
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
-	int retval;
-
+	int retval = 1;
 	lock_kernel();
-
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	retval = 1;
-	if (!(current->flags & PF_MEMALLOC)) {
-		int priority;
 
-		current->flags |= PF_MEMALLOC;
-	
-		priority = 8;
-		do {
-			free_memory(shrink_mmap(priority, gfp_mask));
-			free_memory(shm_swap(priority, gfp_mask));
-			free_memory(swap_out(priority, gfp_mask));
-			shrink_dcache_memory(priority, gfp_mask);
-		} while (--priority >= 0);
-		retval = 0;
-done:
-		current->flags &= ~PF_MEMALLOC;
+	current->flags |= PF_MEMALLOC;
+	atomic_inc(&nr_tasks_freeing_memory);
+	while (count--)
+	{
+		static int state = 0;
+		if (!do_free_page(&state, gfp_mask))
+		{
+			retval = 0;
+			break;
+		}
 	}
-	unlock_kernel();
+	atomic_dec(&nr_tasks_freeing_memory);
+	current->flags &= ~PF_MEMALLOC;
 
+	unlock_kernel();
 	return retval;
 }
 
-/*
- * Wake up kswapd according to the priority
- *	0 - no wakeup
- *	1 - wake up as a low-priority process
- *	2 - wake up as a normal process
- *	3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(struct task_struct *p, int priority)
+void kswapd_wakeup(void)
 {
-	if (priority) {
-		p->counter = p->priority << priority;
-		wake_up_process(p);
-	}
-}
+	struct task_struct * p = kswapd_task;
 
-/* 
- * The swap_tick function gets called on every clock tick.
- */
-void swap_tick(void)
-{
-	struct task_struct *p = kswapd_task;
-
-	/*
-	 * Only bother to try to wake kswapd up
-	 * if the task exists and can be woken.
-	 */
-	if (p && (p->state & TASK_INTERRUPTIBLE)) {
-		unsigned int pages;
-		int want_wakeup;
-
-		/*
-		 * Schedule for wakeup if there isn't lots
-		 * of free memory or if there is too much
-		 * of it used for buffers or pgcache.
-		 *
-		 * "want_wakeup" is our priority: 0 means
-		 * not to wake anything up, while 3 means
-		 * that we'd better give kswapd a realtime
-		 * priority.
-		 */
-		want_wakeup = 0;
-		pages = nr_free_pages;
-		if (pages < freepages.high)
-			want_wakeup = 1;
-		if (pages < freepages.low)
-			want_wakeup = 2;
-		if (pages < freepages.min)
-			want_wakeup = 3;
-	
-		kswapd_wakeup(p,want_wakeup);
-	}
-
-	timer_active |= (1<<SWAP_TIMER);
-}
-
-/* 
- * Initialise the swap timer
- */
-
-void init_swap_timer(void)
-{
-	timer_table[SWAP_TIMER].expires = jiffies;
-	timer_table[SWAP_TIMER].fn = swap_tick;
-	timer_active |= (1<<SWAP_TIMER);
+	if (p && (p->state & TASK_INTERRUPTIBLE) &&
+	    !atomic_read(&nr_tasks_freeing_memory))
+		wake_up_process(p);
 }
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.5 linux/mm/page_alloc.c:1.1.1.1.2.18
--- linux/mm/page_alloc.c:1.1.1.5	Sun Jan  3 20:42:44 1999
+++ linux/mm/page_alloc.c	Tue Jan  5 16:17:00 1999
@@ -3,6 +3,7 @@
  *
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  memory_trashing heuristic. Copyright (C) 1998  Andrea Arcangeli
  */
 
 #include <linux/config.h>
@@ -250,17 +251,18 @@
 		 * a bad memory situation, we're better off trying
 		 * to free things up until things are better.
 		 *
-		 * Normally we shouldn't ever have to do this, with
-		 * kswapd doing this in the background.
-		 *
 		 * Most notably, this puts most of the onus of
 		 * freeing up memory on the processes that _use_
 		 * the most memory, rather than on everybody.
 		 */
-		if (nr_free_pages > freepages.min) {
+		if (nr_free_pages > freepages.min+(1<<order)) {
 			if (!current->trashing_memory)
+				goto ok_to_allocate;
+			if (current->flags & PF_MEMALLOC)
+				goto ok_to_allocate;
+			if (nr_free_pages > freepages.low+(1<<order))
 				goto ok_to_allocate;
-			if (nr_free_pages > freepages.low) {
+			if (nr_free_pages > freepages.high+(1<<order)) {
 				current->trashing_memory = 0;
 				goto ok_to_allocate;
 			}
@@ -271,8 +273,11 @@
 		 * memory.
 		 */
 		current->trashing_memory = 1;
-		if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+		if (!try_to_free_pages(gfp_mask, freepages.high - nr_free_pages + (1<<order)) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 			goto nopage;
+	} else {
+		if (nr_free_pages < freepages.min)
+			kswapd_wakeup();
 	}
 ok_to_allocate:
 	spin_lock_irqsave(&page_alloc_lock, flags);
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.2.13
--- linux/include/linux/mm.h:1.1.1.3	Sat Jan  2 15:24:18 1999
+++ linux/include/linux/mm.h	Mon Jan  4 18:42:52 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -295,8 +294,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.5 linux/mm/swap.c:1.1.1.1.2.8
--- linux/mm/swap.c:1.1.1.5	Sat Jan  2 15:24:40 1999
+++ linux/mm/swap.c	Sat Jan  2 21:40:13 1999
@@ -64,13 +64,13 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
+	5,	/* minimum percent buffer */
 	10,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	2,	/* minimum percent page cache */
+	5,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
 };
Index: linux/include/linux/swap.h
diff -u linux/include/linux/swap.h:1.1.1.4 linux/include/linux/swap.h:1.1.1.1.2.10
--- linux/include/linux/swap.h:1.1.1.4	Tue Dec 29 01:39:03 1998
+++ linux/include/linux/swap.h	Tue Jan  5 01:12:59 1999
@@ -83,6 +83,7 @@
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(unsigned int gfp_mask, int count);
+extern void kswapd_wakeup(void);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, unsigned long, char *, int);
@@ -167,9 +168,11 @@
 	count = atomic_read(&page->count);
 	if (PageSwapCache(page))
 	{
+#if 0
 		/* PARANOID */
 		if (page->inode != &swapper_inode)
 			panic("swap cache page has wrong inode\n");
+#endif
 		count += swap_count(page->offset) - 2;
 	}
 	if (PageFreeAfter(page))


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-05 13:23                                   ` Zlatko Calusic
@ 1999-01-05 15:42                                     ` Andrea Arcangeli
  1999-01-05 16:16                                       ` Zlatko Calusic
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-05 15:42 UTC (permalink / raw
  To: Zlatko Calusic; +Cc: linux-kernel, linux-mm

On 5 Jan 1999, Zlatko Calusic wrote:

> I tried few times, but to no avail. Looks like subtle race, bad news
> for you, unfortunately.

Hmm, I guess it's been due to the wrong order shifting you pointed out a bit
before...

The lockup could be due to an oom loop. Ingo pointed out at once that
raid1 (if I remember well) has one of them. Do you use raidx?

> Sure, just be careful. :)

Don't worry ;). Could you try if you can reproduce problems with
arca-vm-8? 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]
  1999-01-05 15:42                                     ` Andrea Arcangeli
@ 1999-01-05 16:16                                       ` Zlatko Calusic
  0 siblings, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-05 16:16 UTC (permalink / raw
  To: Andrea Arcangeli

Andrea Arcangeli <andrea@e-mind.com> writes:

> On 5 Jan 1999, Zlatko Calusic wrote:
> 
> > I tried few times, but to no avail. Looks like subtle race, bad news
> > for you, unfortunately.
> 
> Hmm, I gues it's been due the wrong order shifiting you pointed out a bit
> before...

Nope. I fixed that before compiling. :)
It's even in my PRCS tree, your patches and my parentheses. :)

linux-2.1 2204.3 Tue, 05 Jan 1999 00:15:24 +0100 by zcalusic
Parent-Version:      2204.2
Version-Log:         MM & no kswapd (andrea)

linux-2.1 2204.4 Tue, 05 Jan 1999 03:34:57 +0100 by zcalusic
Parent-Version:      2204.2
Version-Log:         arca-vm-7

> 
> The lockup could be due to one oom loop. Ingo pointed out at once that
> raid1 (if I remeber well) has one of them. Do you use raidx?
> 

Wow, that's a new variable in a story, I'm indeed using raid0 (IDE +
SCSI). That is it, then. Should I contact Ingo about that? I'm not on
linux-raid, so I never heard of a problem like that, in fact it
happened only yesterday I lost control of machine in such a strange
way.

> > Sure, just be careful. :)
> 
> Don't worry ;). Could you try if you can reproduce problems with
> arca-vm-8? 
> 

Huh, I must refuse your proposal, at least til' I get some
sleep. :(

Tomorrow is non-working day, so I'll spend some time reading stuff
(recently I bought Rubini's Device Drivers), and on the Thursday I'm
back to regular schedule, sleepless nights and arca-vm-10, at that
time, probably. :)

While at VM changes, I have one (reborn) objection. It looks like
recent kernels are once again very aggressive when it comes to copying
lots of data. That is, if you cp a few hundred MBs, you effectively
finish with cleansed memory (populated with page cache pages) and
programs are on swap. Behaviour is practically identical in vanilla
Linus' tree and with your changes applied. Maybe you could, while
you're at it, see if that problem can be solved. With such a
behaviour, Linux feels very sluggish, feels like NT crap.

I know it's tough job, because I spent lots of time trying, but my
conclusion is that whenever you have good swapping speed, kernel will
outswap too much. On the other side if you fix that, swapping speed
drops. Tough luck. :(

I wish you good luck with your work, anyway.
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-05 15:35                               ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Andrea Arcangeli
@ 1999-01-06 14:48                                 ` Andrea Arcangeli
  1999-01-06 23:31                                   ` Andrea Arcangeli
  1999-01-06 23:35                                   ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Linus Torvalds
  0 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-06 14:48 UTC (permalink / raw
  To: steve, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann
  Cc: Linus Torvalds, Alan Cox, bredelin, Stephen C. Tweedie,
	linux-kernel, Rik van Riel, linux-mm

On Tue, 5 Jan 1999, Andrea Arcangeli wrote:

> I fixed some thing in arca-vm-7. This new is arca-vm-8.

I've put out arca-vm-9.

It seems that it's a loss to mark all freed pages as not referenced in
__free_pages(). Probably because shrink_mmap() doesn't like to decrease
the `count' on just freed pages. So now I mark all freed pages as
referenced.

In the last patches (arca-vm-[78]) I forgot to include the filemap.c diff
that seems to improve performance here (allowing the swap cache to be
shrunk without caring about pgcache_under_min()).

arca-vm-9 returns to a linear behavior in calculating the swapout weight.

You can download arca-vm-9 from here:

ftp://e-mind.com/pub/linux/kernel-patches/2.2.0-pre4-arca-VM-9

Let me know if you'll try it. Thanks!

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-06 14:48                                 ` Andrea Arcangeli
@ 1999-01-06 23:31                                   ` Andrea Arcangeli
  1999-01-07  3:32                                     ` Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7 Steve Bergman
  1999-01-06 23:35                                   ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-06 23:31 UTC (permalink / raw
  To: steve, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann
  Cc: bredelin, linux-kernel, linux-mm, Linus Torvalds, Alan Cox,
	Stephen C. Tweedie

On Wed, 6 Jan 1999, Andrea Arcangeli wrote:

> I've put out arca-vm-9.

Whoops, in both arca-vm-8 and arca-vm-9 there was a very stupid bug in my
changes to the memory_trashing heuristic (done too late at night..). 
Basically once a process was marked as a memory_trasher, it had no way
to return to being an unmarked process................ 

I didn't notice the bug because I use swap only when I do my benchmarks
(when there is no swapping in progress the slowdown due to some
shrink_mmap() can't be seen with the eyes...) and when I run my benchmarks I
always start before the memory trasher proggy...

Thanks to Benjamin who showed me the bugs some seconds ago ;)

I've put out a new arca-vm-10 with at least this bug fixed.

ftp://e-mind.com/pub/linux/kernel-patches/2.2.0-pre4-arca-VM-10

Excuse me...

BTW, I have reports that arca-vm-6/7 are faster than arca-vm-8/9
(arca-vm-7 is reported the fastest, even more than arca-vm-3). Maybe it's
due to this bug that the latest ones are slower, or maybe the whole new
changes to the first memory_trashing code (the one in 2.2.0-pre4) are
hurting... (even if here they seem to help; with them the trashing
process remains marked all the time and not only during low memory peaks; and
non-trashing processes never get marked as trashing) 

And again thanks to Steve, Zlatko, Benjamin, Ben, Garst, MikeG, Kalle,
Brent and all other testers for their good reports! 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-06 14:48                                 ` Andrea Arcangeli
  1999-01-06 23:31                                   ` Andrea Arcangeli
@ 1999-01-06 23:35                                   ` Linus Torvalds
  1999-01-07  4:30                                     ` Eric W. Biederman
  1999-01-07 14:11                                     ` Andrea Arcangeli
  1 sibling, 2 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-06 23:35 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: steve, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm


Oh, well.. Based on what the arca-[678] patches did, there's now a pre-5
out there. Not very similar, but it should incorporate the basic idea: 
namely much more aggressively asynchronous swap-outs from a process
context. 

Comment away,

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-06 23:31                                   ` Andrea Arcangeli
@ 1999-01-07  3:32                                     ` Steve Bergman
  1999-01-07 12:02                                       ` Andrea Arcangeli
                                                         ` (2 more replies)
  0 siblings, 3 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-07  3:32 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: brent verner, Garst R. Reese, Kalle Andersson, Zlatko Calusic,
	Ben McCann, bredelin, linux-kernel, linux-mm, Linus Torvalds,
	Alan Cox, Stephen C. Tweedie

Andrea Arcangeli wrote:

> I've put out a new arca-vm-10 with at least this bug fixed.
> 
> ftp://e-mind.com/pub/linux/kernel-patches/2.2.0-pre4-arca-VM-10
> 

Here are my latest numbers.  This is timing a complete kernel compile  (make
clean;make depend;make;make modules;make modules_install)  in 16MB memory with
netscape, kde, and various daemons running.  I unknowningly had two more daemons
running in the background this time than last so the numbers can't be compared
directly with my last test (Which I think I only sent to Andrea).  But all of
these numbers are consistent with *each other*.


kernel		Time	Maj pf	Min pf  Swaps
----------	-----	------	------	-----
2.2.0-pre5	18:19	522333	493803	27984
arcavm10	19:57	556299	494163	12035
arcavm9		19:55	553783	494444	12077
arcavm7		18:39	538520	493287	11526


Pre5 looks good.
Arcavm7 still looks better than arcavm10.

-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-06 23:35                                   ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Linus Torvalds
@ 1999-01-07  4:30                                     ` Eric W. Biederman
  1999-01-07 17:56                                       ` Linus Torvalds
  1999-01-07 14:11                                     ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Eric W. Biederman @ 1999-01-07  4:30 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm

>>>>> "LT" == Linus Torvalds <torvalds@transmeta.com> writes:

LT> Oh, well.. Based on what the arca-[678] patches did, there's now a pre-5
LT> out there. Not very similar, but it should incorporate the basic idea: 
LT> namely much more aggressively asynchronous swap-outs from a process
LT> context. 

LT> Comment away,

1) With your comments on PG_dirty/(what shrink_mmap should do) you
   have worked out what needs to happen for the mapped in memory case,
   and I haven't quite gotten there.  Thank You.

2) I have tested using PG_dirty from shrink_mmap and it is a
   performance problem because it loses all locality of reference,
   and because it forces shrink_mmap into a dual role, of freeing and
   writing pages, which need seperate tuning.

Linus is this a case you feel is important to tune for 2.2?
If so I would be happy to play with it.

Eric
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07  3:32                                     ` Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7 Steve Bergman
@ 1999-01-07 12:02                                       ` Andrea Arcangeli
  1999-01-07 20:27                                         ` Linus Torvalds
  1999-01-07 17:35                                       ` Linus Torvalds
  1999-01-09 22:39                                       ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
  2 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-07 12:02 UTC (permalink / raw
  To: Steve Bergman; +Cc: linux-kernel, linux-mm, Linus Torvalds

On Wed, 6 Jan 1999, Steve Bergman wrote:

> kernel	Time	Maj pf	Min pf  Swaps
> ----------	-----	------	------	-----
> 2.2.0-pre5	18:19	522333	493803	27984
> arcavm10	19:57	556299	494163	12035
> arcavm9	19:55	553783	494444	12077
> arcavm7	18:39	538520	493287	11526

Happy to hear that ! ;)

The changes in 2.2.0-pre5 look really cool! I think the only missing
thing that I would like to see included is my calc_swapout_weight() thing. This
change of mine would avoid swap_out() stalling the system too much in the presence
of huge tasks and so it would allow the VM to scale better... I'll do some
tests starting from pre5 now...

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-06 23:35                                   ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Linus Torvalds
  1999-01-07  4:30                                     ` Eric W. Biederman
@ 1999-01-07 14:11                                     ` Andrea Arcangeli
  1999-01-07 18:19                                       ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-07 14:11 UTC (permalink / raw
  To: Linus Torvalds
  Cc: steve, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm

On Wed, 6 Jan 1999, Linus Torvalds wrote:

> Oh, well.. Based on what the arca-[678] patches did, there's now a pre-5
> out there. Not very similar, but it should incorporate the basic idea: 
> namely much more aggressively asynchronous swap-outs from a process
> context. 

I like it infact ;). I just have some diff that I would like to put under
testing. The patches are against 2.2.0-pre5.

This first patch allows swap_out to have a more fine grained weight. It should
help at least in low memory environments.

diff -u linux/mm/vmscan.c:1.1.1.10 linux/mm/vmscan.c:1.1.1.1.2.72
--- linux/mm/vmscan.c:1.1.1.10	Thu Jan  7 12:21:36 1999
+++ linux/mm/vmscan.c	Thu Jan  7 14:46:17 1999
@@ -171,7 +179,7 @@
  */
 
 static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -192,18 +200,20 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		address += PAGE_SIZE;
+		tsk->swap_address = address;
 		if (result)
 			return result;
-		address += PAGE_SIZE;
+		if (!--*counter)
+			return 0;
 		pte++;
 	} while (address < end);
 	return 0;
 }
 
 static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
-	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+	pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -223,9 +233,11 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address < end);
@@ -233,7 +245,7 @@
 }
 
 static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
-	unsigned long address, int gfp_mask)
+	unsigned long address, int gfp_mask, unsigned long * counter)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -247,16 +259,19 @@
 
 	end = vma->vm_end;
 	while (address < end) {
-		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter);
 		if (result)
 			return result;
+		if (!*counter)
+			return 0;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
 	}
 	return 0;
 }
 
-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_process(struct task_struct * p, int gfp_mask,
+			    unsigned long * counter)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -275,9 +290,12 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(p, vma, address, gfp_mask);
+			int result = swap_out_vma(p, vma, address, gfp_mask,
+						  counter);
 			if (result)
 				return result;
+			if (!*counter)
+				return 0;
 			vma = vma->vm_next;
 			if (!vma)
 				break;
@@ -291,6 +309,25 @@
 	return 0;
 }
 
+static inline unsigned long calc_swapout_weight(int priority)
+{
+	struct task_struct * p;
+	unsigned long total_vm = 0;
+
+	read_lock(&tasklist_lock);
+	for_each_task(p)
+	{
+		if (!p->swappable)
+			continue;
+		if (p->mm->rss == 0)
+			continue;
+		total_vm += p->mm->total_vm;
+	}
+	read_unlock(&tasklist_lock);
+
+	return total_vm / (1+priority);
+}
+
 /*
  * Select the task with maximal swap_cnt and try to swap out a page.
  * N.B. This function returns only 0 or 1.  Return values != 1 from
@@ -299,7 +336,10 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int assign;
+	unsigned long counter, max_cnt;
+
+	counter = calc_swapout_weight(priority);
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -315,23 +355,17 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
-	if (counter < 1)
-		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
-
-	for (; counter >= 0; counter--) {
+	while (counter != 0) {
 		assign = 0;
 		max_cnt = 0;
 		pbest = NULL;
 	select:
 		read_lock(&tasklist_lock);
-		p = init_task.next_task;
-		for (; p != &init_task; p = p->next_task) {
+		for_each_task(p)
+		{
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -350,7 +384,7 @@
 			goto out;
 		}
 
-		if (swap_out_process(pbest, gfp_mask))
+		if (swap_out_process(pbest, gfp_mask, &counter))
 			return 1;
 	}
 out:







This other patch instead change a bit the trashing memory heuristic and
how many pages are freed every time. I am not sure it's the best thing to
do. So if you'll try it let me know the results... 

Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.6 linux/mm/page_alloc.c:1.1.1.1.2.22
--- linux/mm/page_alloc.c:1.1.1.6	Thu Jan  7 12:21:35 1999
+++ linux/mm/page_alloc.c	Thu Jan  7 12:57:23 1999
@@ -3,6 +3,7 @@
  *
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  memory_trashing heuristic. Copyright (C) 1998  Andrea Arcangeli
  */
 
 #include <linux/config.h>
@@ -258,20 +259,18 @@
 		 * a bad memory situation, we're better off trying
 		 * to free things up until things are better.
 		 *
-		 * Normally we shouldn't ever have to do this, with
-		 * kswapd doing this in the background.
-		 *
 		 * Most notably, this puts most of the onus of
 		 * freeing up memory on the processes that _use_
 		 * the most memory, rather than on everybody.
 		 */
-		if (nr_free_pages > freepages.min) {
+		if (nr_free_pages > freepages.min+(1<<order)) {
 			if (!current->trashing_memory)
 				goto ok_to_allocate;
-			if (nr_free_pages > freepages.low) {
+			if (nr_free_pages > freepages.high+(1<<order)) {
 				current->trashing_memory = 0;
 				goto ok_to_allocate;
-			}
+			} else if (nr_free_pages > freepages.low+(1<<order))
+				goto ok_to_allocate;
 		}
 		/*
 		 * Low priority (user) allocations must not
@@ -282,7 +281,7 @@
 		{
 			int freed;
 			current->flags |= PF_MEMALLOC;
-			freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
+			freed = try_to_free_pages(gfp_mask, freepages.high - nr_free_pages + (1<<order));
 			current->flags &= ~PF_MEMALLOC;
 			if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 				goto nopage;



Thanks.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07  3:32                                     ` Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7 Steve Bergman
  1999-01-07 12:02                                       ` Andrea Arcangeli
@ 1999-01-07 17:35                                       ` Linus Torvalds
  1999-01-07 18:44                                         ` Zlatko Calusic
  1999-01-09 22:39                                       ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
  2 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-07 17:35 UTC (permalink / raw
  To: Steve Bergman
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie



On Wed, 6 Jan 1999, Steve Bergman wrote:
> 
> Here are my latest numbers.  This is timing a complete kernel compile  (make
> clean;make depend;make;make modules;make modules_install)  in 16MB memory with
netscape, kde, and various daemons running.  I unknowingly had two more daemons
> running in the background this time than last so the numbers can't be compared
> directly with my last test (Which I think I only sent to Andrea).  But all of
> these numbers are consistent with *each other*.
> 
> 
> kernel		Time	Maj pf	Min pf  Swaps
> ----------	-----	------	------	-----
> 2.2.0-pre5		18:19	522333	493803	27984
> arcavm10		19:57	556299	494163	12035
> arcavm9		19:55	553783	494444	12077
> arcavm7		18:39	538520	493287	11526

Don't look too closely at the "swaps" number - I think pre-5 just changed
accounting a bit. A lot of the "swaps" are really just dropping a virtual
mapping (that is later picked up again from the page cache or the swap
cache). 

Basically, pre-5 uses the page cache and the swap cache more actively as a
"victim cache", and that inflates the "swaps" number simply due to the
accounting issues. 

I guess I shouldn't count the simple "drop_pte" operation as a swap at
all, because it doesn't involve any IO.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07  4:30                                     ` Eric W. Biederman
@ 1999-01-07 17:56                                       ` Linus Torvalds
  1999-01-07 18:18                                         ` Rik van Riel
                                                           ` (5 more replies)
  0 siblings, 6 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-07 17:56 UTC (permalink / raw
  To: Eric W. Biederman
  Cc: Andrea Arcangeli, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm



On 6 Jan 1999, Eric W. Biederman wrote:
> 
> 1) With your comments on PG_dirty/(what shrink_mmap should do) you
>    have worked out what needs to happen for the mapped in memory case,
>    and I haven't quite gotten there.  Thank You.

Note that it is not finalized. That's why I didn't write the code (which
should be fairly simple), because it has some fairly subtle issues and
thus becomes a 2.3.x thing, I very much suspect.

Basically, my rule of thumb for the changes I did was: "it should have the
same code paths as the old code". What that means is that I didn't
actually do any changes that changed real code: I did only changes that
changed _behaviour_.

That way I can be reasonably hopeful that there are no new bugs introduced
even though performance is very different. I _do_ have some early data
that seems to say that this _has_ uncovered a very old deadlock condition: 
something that could happen before but was almost impossible to trigger. 

The deadlock I suspect is:
 - we're low on memory
 - we allocate or look up a new block on the filesystem. This involves
   getting the ext2 superblock lock, and doing a "bread()" of the free
   block bitmap block.
 - this causes us to try to allocate a new buffer, and we are so low on
   memory that we go into try_to_free_pages() to find some more memory.
 - try_to_free_pages() finds a shared memory file to page out.
 - trying to page that out, it looks up the buffers on the filesystem it
   needs, but deadlocks on the superblock lock.

Note that this could happen before too (I've not removed any of the
codepaths that could lead to it), but it was dynamically _much_ less
likely to happen.

I'm not even sure it really exists, but I have some really old reports
that _could_ be due to this, and a few more recent ones (that I never
could explain). And I have a few _really_ recent ones from here internally
at transmeta that looks like it's triggering more easily these days.

(Note that this is not actually pre5-related: I've been chasing this on
and off for some time, and it seems to have just gotten easier to trigger,
which is why I finally have a theory on what is going on - just a theory
though, and I may be completely off the mark). 

The positive news is that if I'm right in my suspicions it can only happen
with shared writable mappings or shared memory segments. The bad news is
that the bug appears rather old, and no immediate solution presents
itself. 

> 2) I have tested using PG_dirty from shrink_mmap and it is a
>    performance problem because it loses all locality of reference,
>    and because it forces shrink_mmap into a dual role, of freeing and
>    writing pages, which need separate tuning.

Exactly. This is part of the complexity.

The right solution (I _think_) is to conceptually always mark it PG_dirty
in vmscan, and basically leave all the nasty cases to the filemap physical
page scan. But in the simple cases (ie a swap-cached page that is only
mapped by one process and doesn't have any other users), you'd start the
IO "early".

That would essentially mean that normal single mappings get the good
locality, while the case we really suck at right now (multiple mappings
which can all dirty the page) would not cause excessive page-outs. 

Basically, I think that the stuff we handle now with the swap-cache we do
well on already, and we'd only really want to handle the shared memory
case with PG_dirty. But I think this is a 2.3 issue, and I only added the
comment (and the PG_dirty define) for now. 

> Linus is this a case you feel is important to tune for 2.2?
> If so I would be happy to play with it.

It might be something good to test out, but I really don't want patches at
this date (unless your patches also fix the above deadlock problem, which
I can't see them doing ;)

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 17:56                                       ` Linus Torvalds
@ 1999-01-07 18:18                                         ` Rik van Riel
  1999-01-07 19:19                                           ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] Alan Cox
  1999-01-07 18:55                                         ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Zlatko Calusic
                                                           ` (4 subsequent siblings)
  5 siblings, 1 reply; 243+ messages in thread
From: Rik van Riel @ 1999-01-07 18:18 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Eric W. Biederman, Andrea Arcangeli, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel, linux-mm

On Thu, 7 Jan 1999, Linus Torvalds wrote:
> On 6 Jan 1999, Eric W. Biederman wrote:


> > 2) I have tested using PG_dirty from shrink_mmap and it is a
> >    performance problem because it loses all locality of reference,
> >    and because it forces shrink_mmap into a dual role, of freeing and
> >    writing pages, which need separate tuning.
> 
> Exactly. This is part of the complexity.

It can be solved by having a 'laundry' list like the *BSD
folks have and maybe a special worker thread to take care
of the laundry (optimizing placement on disk, etc).

> The right solution (I _think_) is to conceptually always mark it
> PG_dirty in vmscan, and basically leave all the nasty cases to the
> filemap physical page scan. But in the simple cases (ie a
> swap-cached page that is only mapped by one process and doesn't
> have any other users), you'd start the IO "early".
>
> That would essentially mean that normal single mappings get the good
> locality, while the case we really suck at right now (multiple mappings
> which can all dirty the page) would not cause excessive page-outs. 

We can already do that by simply not writing the page to
disk if there are other users besides us (keeping in mind
the swap cache and other system things).

One problem might be that we could end up with more on-disk
fragmentation that way (and maybe less clusterable I/O).

> Basically, I think that the stuff we handle now with the
> swap-cache we do well on already, and we'd only really want to
> handle the shared memory case with PG_dirty. But I think this is a
> 2.3 issue, and I only added the comment (and the PG_dirty define)
> for now.

It's quite definitely 2.3. It's just a minor performance
issue for most systems (an extra write is an order of
magnitude cheaper than an extra read where a process is
actually waiting).


Rik -- If a Microsoft product fails, who do you sue?
+-------------------------------------------------------------------+
| Linux memory management tour guide.        riel@humbolt.geo.uu.nl |
| Scouting Vries cubscout leader.    http://humbolt.geo.uu.nl/~riel |
+-------------------------------------------------------------------+

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 14:11                                     ` Andrea Arcangeli
@ 1999-01-07 18:19                                       ` Linus Torvalds
  1999-01-07 20:35                                         ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-07 18:19 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: steve, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm



On Thu, 7 Jan 1999, Andrea Arcangeli wrote:
> 
> This first patch allows swap_out to have a more fine-grained weight. It should
> help at least in low-memory environments.

The basic reason I didn't want to do this was that I thought it was wrong
to try to base _any_ decision on any virtual memory sizes. The reason is
simply that I think RSS isn't a very interesting thing to look at.

Yes, the current version also looks at RSS, but if you actually read the
code and think about what it does, it really only uses RSS as an
"ordering"  issue, and it doesn't actually matter for anything else -
we'll walk through all processes until they are all exhausted, and the
only thing that RSS does for us is to start off with the larger one.

Basically, it doesn't matter for anything but startup, because the steady
state will essentially just be a "go through each process in the list over
and over again", and the fact that the list has some ordering is pretty
much inconsequential. 

The real decision on what to throw out is done by the physical page scan,
that takes the PG_referenced bit into account.

So essentially, if we get anything wrong when we do the virtual page table
walk, the only thing that results in is that we might handle a few extra
page faults (not no extra IO, because the page faults will be satisfied
from the victim caches - the page cache and the swap cache). 

The only case this isn't true is the case where we have a shared file
mapping. That's where the PG_dirty issues come in - we've never done that
well from a performance standpoint, and pre-5 does not change that fact,
it just lays some foundations for doing it right in the future. 

So that's why I'd prefer to not complicate the VM counting any more. I
don't think it should make any fundamental difference (it might make a
difference in various extreme cases, but not, I think, under any kind of
realistic load).

But who knows, I've been wrong before. But now at least you know why I
didn't want it in the default kernel. 

> This other patch instead change a bit the trashing memory heuristic and
> how many pages are freed every time. I am not sure it's the best thing to
> do. So if you'll try it let me know the results... 

I think this might well be tuned some, although I think your patch is
extreme. I'd love to hear comments from people who test it under different
loads and different memory sizes.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07 17:35                                       ` Linus Torvalds
@ 1999-01-07 18:44                                         ` Zlatko Calusic
  1999-01-07 19:33                                           ` Linus Torvalds
                                                             ` (3 more replies)
  0 siblings, 4 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-07 18:44 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

[-- Attachment #1: Type: text/plain, Size: 4091 bytes --]

Linus Torvalds <torvalds@transmeta.com> writes:

> On Wed, 6 Jan 1999, Steve Bergman wrote:
> > 
> > Here are my latest numbers.  This is timing a complete kernel compile  (make
> > clean;make depend;make;make modules;make modules_install)  in 16MB memory with
> > netscape, kde, and various daemons running.  I unknowningly had two more daemons
> > running in the background this time than last so the numbers can't be compared
> > directly with my last test (Which I think I only sent to Andrea).  But all of
> > these numbers are consistent with *each other*.
> > 
> > 
> > kernel		Time	Maj pf	Min pf  Swaps
> > ----------	-----	------	------	-----
> > 2.2.0-pre5		18:19	522333	493803	27984
> > arcavm10		19:57	556299	494163	12035
> > arcavm9		19:55	553783	494444	12077
> > arcavm7		18:39	538520	493287	11526
> 
> Don't look too closely at the "swaps" number - I think pre-5 just changed
> accounting a bit. A lot of the "swaps" are really just dropping a virtual
> mapping (that is later picked up again from the page cache or the swap
> cache). 
> 
> Basically, pre-5 uses the page cache and the swap cache more actively as a
> "victim cache", and that inflates the "swaps" number simply due to the
> accounting issues. 
> 
> I guess I shouldn't count the simple "drop_pte" operation as a swap at
> all, because it doesn't involve any IO.
> 

2.2.0-pre5 works very well, indeed, but it still has some not
sufficiently explored nuisances:

1) Swap performance in pre-5 is much worse compared to pre-4 in
*certain* circumstances. I'm using quite stupid and unintelligent
program to check for raw swap speed (attached below). With 64 MB of
RAM I usually run it as 'hogmem 100 3' and watch for result which is
recently around 6 MB/sec. But when I lately decided to start two
instances of it like "hogmem 50 3 & hogmem 50 3 &" in pre-4 I got 2 x
2.5 MB/sec and in pre-5 it is only 2 x 1 MB/sec and disk is making
very weird and frightening sounds. My conclusion is that now (pre-5)
system behaves much poorer when we have more than one thrashing
task. *Please*, check this, it is a quite serious problem.

2) In pre-5, under heavy load, free memory is hovering around
freepages.min instead of being somewhere between freepages.low &
freepages.max. This could make trouble for bursts of atomic
allocations (networking!).

3) Nitpick #1: /proc/swapstats exist but is only filled with
zeros. Probably it should go away. I believe Stephen added it
recently, but only part of his patch got actually applied.

4) Nitpick #2: "Swap cache:" line in the report of Alt-SysRq-M is not
useful as it is laid now. People have repeatedly sent patches (Rik,
Andrea...) to fix this but it is still not fixed, as of pre-5.

5) There is lots of #if 0 constructs in MM code, and also lots of
structures are not anymore used but still take precious memory in
compiled kernel and uncover itself under /proc (/proc/sys/vm/swapctl
for instance). Do you want a patch to remove this cruft?

6) Finally one suggestion of mine. In swapfile.c there is comment:

         * We try to cluster swap pages by allocating them
         * sequentially in swap.  Once we've allocated
         * SWAP_CLUSTER_MAX pages this way, however, we resort to
         * first-free allocation, starting a new cluster.  This
         * prevents us from scattering swap pages all over the entire
         * swap partition, so that we reduce overall disk seek times

This is good, but clustering of only 32 (SWAP_CLUSTER_MAX) * 4KB =
128KB is too small for today's disk and swap sizes. I tried to enlarge
this value to something like 2 MB and got much much better results.
This is very important now that we have swapin readahead to keep pages
as adjacent as possible to each other so hit rate is big. It is
trivial (one liner) and completely safe to make this constant much
bigger, so I'm not even attaching a patch. 512 works very well and
swapping is much faster than with the default value in place. Maybe this
should even be sysctl controllable. If you agree with the last idea,
I'll send you a patch, just confirm.

I promised memory hogger:


[-- Attachment #2: Hogmem.c --]
[-- Type: application/octet-stream, Size: 1067 bytes --]

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <limits.h>
#include <signal.h>
#include <time.h>
#include <sys/times.h>

/* Bytes per megabyte. */
#define MB (1024 * 1024)

/*
 * Globals shared with the SIGINT/SIGTERM handler intr():
 *   nr      - size of the hogged area in MB (from argv[1])
 *   intsize - cached sizeof(int)
 *   i       - index of the int currently being touched in the scan loop
 *   t       - number of complete passes over the area so far
 */
int nr, intsize, i, t;
clock_t st;		/* start time in clock ticks, set just before the scan */
struct tms dummy;	/* times() needs somewhere to write; contents unused */

void intr(int intnum)
{
    clock_t et = times(&dummy);

    printf("\nMemory speed: %.2f MB/sec\n", (2 * t * CLK_TCK * nr + (double) i * CLK_TCK * intsize / MB) / (et - st));
    exit(EXIT_SUCCESS);
}

/*
 * Entry point: hogmem <MB> [times]
 *
 * Allocates <MB> megabytes and repeatedly increments every int in the
 * area, forcing the VM to page under memory pressure.  Prints one '.'
 * per pass.  After [times] passes (default: effectively forever), or on
 * SIGINT/SIGTERM, intr() reports the measured bandwidth and exits.
 *
 * Fixes over the original: argv[1] is validated as a positive size, the
 * allocation size is computed in size_t (nr * MB overflows int for
 * nr >= 2048, which is undefined behavior), and the malloc() result is
 * checked before use instead of being dereferenced unconditionally.
 */
int main(int argc, char **argv)
{
    int max, nr_times, *area, c;

    setbuf(stdout, 0);          /* unbuffered, so progress dots appear at once */
    signal(SIGINT, intr);
    signal(SIGTERM, intr);
    intsize = sizeof(int);
    if (argc < 2 || argc > 3) {
        fprintf(stderr, "Usage: hogmem <MB> [times]\n");
        exit(EXIT_FAILURE);
    }
    nr = atoi(argv[1]);
    /* Reject zero, negative, and non-numeric sizes: the scan assumes nr >= 1. */
    if (nr <= 0) {
        fprintf(stderr, "hogmem: <MB> must be a positive integer\n");
        exit(EXIT_FAILURE);
    }
    if (argc == 3)
        nr_times = atoi(argv[2]);
    else
        nr_times = INT_MAX;
    /* Do the size arithmetic in size_t: nr * MB overflows int for nr >= 2048. */
    area = malloc((size_t) nr * MB);
    if (!area) {
        fprintf(stderr, "hogmem: cannot allocate %d MB\n", nr);
        exit(EXIT_FAILURE);
    }
    /* Number of ints in the area; divide first so the product stays smaller. */
    max = nr * (MB / intsize);
    st = times(&dummy);
    for (c = 0; c < nr_times; c++) {
        for (i = 0; i < max; i++)
            area[i]++;          /* touch (dirty) every page of the area */
        t++;
        putchar('.');
    }
    /* Only complete passes were made: zero the partial-pass term for intr(). */
    i = 0;
    intr(0);
    /* notreached */
    exit(EXIT_SUCCESS);
}

[-- Attachment #3: Type: text/plain, Size: 74 bytes --]


OK, that's it for today. Don't bang heads too hard and enjoy!
-- 
Zlatko

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 17:56                                       ` Linus Torvalds
  1999-01-07 18:18                                         ` Rik van Riel
@ 1999-01-07 18:55                                         ` Zlatko Calusic
  1999-01-07 22:57                                         ` Linus Torvalds
                                                           ` (3 subsequent siblings)
  5 siblings, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-07 18:55 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Eric W. Biederman, Andrea Arcangeli, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm

Linus Torvalds <torvalds@transmeta.com> writes:

[snip]
> 
> That way I can be reasonably hopeful that there are no new bugs introduced
> even though performance is very different. I _do_ have some early data
> that seems to say that this _has_ uncovered a very old deadlock condition: 
> something that could happen before but was almost impossible to trigger. 
> 
> The deadlock I suspect is:
>  - we're low on memory
>  - we allocate or look up a new block on the filesystem. This involves
>    getting the ext2 superblock lock, and doing a "bread()" of the free
>    block bitmap block.
>  - this causes us to try to allocate a new buffer, and we are so low on
>    memory that we go into try_to_free_pages() to find some more memory.
>  - try_to_free_pages() finds a shared memory file to page out.
>  - trying to page that out, it looks up the buffers on the filesystem it
>    needs, but deadlocks on the superblock lock.
> 
> Note that this could happen before too (I've not removed any of the
> codepaths that could lead to it), but it was dynamically _much_ less
> likely to happen.

You could be very easily right. Look below.

> 
> I'm not even sure it really exists, but I have some really old reports
> that _could_ be due to this, and a few more recent ones (that I never
> could explain). And I have a few _really_ recent ones from here internally
> at transmeta that looks like it's triggering more easily these days.
> 
> (Note that this is not actually pre5-related: I've been chasing this on
> and off for some time, and it seems to have just gotten easier to trigger,
> which is why I finally have a theory on what is going on - just a theory
> though, and I may be completely off the mark). 
> 
> The positive news is that if I'm right in my suspicions it can only happen
> with shared writable mappings or shared memory segments. The bad news is
> that the bug appears rather old, and no immediate solution presents
> itself. 

Exactly. I was torture testing shared mapping when I got very weird
deadlock. It happened only once, few days ago. Look at report and
enjoy:

Jan  5 03:49:14 atlas kernel: SysRq: Show Memory 
Jan  5 03:49:14 atlas kernel: Mem-info: 
Jan  5 03:49:14 atlas kernel: Free pages:         512kB 
Jan  5 03:49:14 atlas kernel:  ( Free: 128 (128 256 384) 
Jan  5 03:49:14 atlas kernel: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 4*128kB = 512kB) 
Jan  5 03:49:14 atlas kernel: Swap cache: add 131125/131125, delete 130652/13065
2, find 0/0 
Jan  5 03:49:14 atlas kernel: Free swap:       231632kB 
Jan  5 03:49:14 atlas kernel: 16384 pages of RAM 
Jan  5 03:49:14 atlas kernel: 956 reserved pages 
Jan  5 03:49:14 atlas kernel: 17996 pages shared 
Jan  5 03:49:14 atlas kernel: 473 pages swap cached 
Jan  5 03:49:14 atlas kernel: 13 pages in page table cache 
Jan  5 03:49:14 atlas kernel: Buffer memory:    14696kB 
Jan  5 03:49:14 atlas kernel: Buffer heads:     14732 
Jan  5 03:49:14 atlas kernel: Buffer blocks:    14696 
Jan  5 03:49:14 atlas kernel:    CLEAN: 144 buffers, 18 used (last=122), 0 locke
d, 0 protected, 0 dirty 

This looks exactly like the problem you were describing, isn't it?

[snip]
> Basically, I think that the stuff we handle now with the swap-cache we do
> well on already, and we'd only really want to handle the shared memory
> case with PG_dirty. But I think this is a 2.3 issue, and I only added the
> comment (and the PG_dirty define) for now. 

Nice, thanks. That will make experimenting slightly easier and will
give courage to people to actually experiment with PG_Dirty
implementation. So far, only Eric did some work in this area.

Of course, this is all 2.3 work.

> 
> > Linus is this a case you feel is important to tune for 2.2?
> > If so I would be happy to play with it.
> 
> It might be something good to test out, but I really don't want patches at
> this date (unless your patches also fix the above deadlock problem, which
> I can't see them doing ;)
> 

Sure!
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch]
  1999-01-07 18:18                                         ` Rik van Riel
@ 1999-01-07 19:19                                           ` Alan Cox
  0 siblings, 0 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-07 19:19 UTC (permalink / raw
  To: Rik van Riel
  Cc: torvalds, ebiederm+eric, andrea, steve, damonbrent, reese,
	kalle.andersson, Zlatko.Calusic, bmccann, alan, bredelin, sct,
	linux-kernel, linux-mm

> It can be solved by having a 'laundry' list like the *BSD
> folks have and maybe a special worker thread to take care
> of the laundry (optimizing placement on disk, etc).

We actually have one - on the sparc anyway there is asyncd

Alan

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07 18:44                                         ` Zlatko Calusic
@ 1999-01-07 19:33                                           ` Linus Torvalds
  1999-01-07 21:10                                             ` Zlatko Calusic
  1999-01-07 19:38                                           ` Zlatko Calusic
                                                             ` (2 subsequent siblings)
  3 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-07 19:33 UTC (permalink / raw
  To: Zlatko Calusic
  Cc: Steve Bergman, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie



On 7 Jan 1999, Zlatko Calusic wrote:
> 
> 1) Swap performance in pre-5 is much worse compared to pre-4 in
> *certain* circumstances. I'm using quite stupid and unintelligent
> program to check for raw swap speed (attached below). With 64 MB of
> RAM I usually run it as 'hogmem 100 3' and watch for result which is
> recently around 6 MB/sec. But when I lately decided to start two
> instances of it like "hogmem 50 3 & hogmem 50 3 &" in pre-4 I got 2 x
> 2.5 MB/sec and in pre-5 it is only 2 x 1 MB/sec and disk is making
> very weird and frightening sounds. My conclusion is that now (pre-5)
> system behaves much poorer when we have more than one thrashing
> task. *Please*, check this, it is a quite serious problem.

Ok, will investigate. One thing you can test is to try out different
"count" arguments to try_to_free_pages() (this was part of what Andrea
did, btw). So instead of (page_alloc.c, line 285):

	freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);

you can try different things for the second argument: the thing Andrea did
was something like

	freed = try_to_free_pages(gfp_mask, freepages.high - nr_free_pages);

which could work well (one thing I'm nervous about is that this probably
needs to be limited some way - it can be quite a large number on large
machines, and that's why I'd like to hear comments from people).

> 2) In pre-5, under heavy load, free memory is hovering around
> freepages.min instead of being somewhere between freepages.low &
> freepages.max. This could make trouble for bursts of atomic
> allocations (networking!).

The change above would change this too.

> 3) Nitpick #1: /proc/swapstats exist but is only filled with
> zeros. Probably it should go away. I believe Stephen added it
> recently, but only part of his patch got actually applied.

Maybe somebody can find a use for it.

> 4) Nitpick #2": "Swap cache:" line in report of Alt-SysRq-M is not
> useful as it is laid now. People have repeatedly sent patches (Rik,
> Andrea...) to fix this but it is still not fixed, as of pre-5.

I never use it, so it hasn't been a big issue. 

> 5) There is lots of #if 0 constructs in MM code, and also lots of
> structures are not anymore used but still take precious memory in
> compiled kernel and uncover itself under /proc (/proc/sys/vm/swapctl
> for instance). Do you want a patch to remove this cruft?

Some of the #if 0 code should certainly be removed. Some of it is useful
as a kind of commentary - sometimes code is removed not because it doesn't
make sense, but because the implementation wasn't quite good enough.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07 18:44                                         ` Zlatko Calusic
  1999-01-07 19:33                                           ` Linus Torvalds
@ 1999-01-07 19:38                                           ` Zlatko Calusic
  1999-01-07 19:40                                           ` Andrea Arcangeli
  1999-01-09  6:28                                           ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Dax Kelson
  3 siblings, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-07 19:38 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

Zlatko Calusic <Zlatko.Calusic@CARNet.hr> writes:

> 2) In pre-5, under heavy load, free memory is hovering around
> freepages.min instead of being somewhere between freepages.low &
> freepages.max. This could make trouble for bursts of atomic
> allocations (networking!).
> 

To followup myself, don't trust me, check your logfiles:

Jan  7 20:12:03 atlas kernel: eth0: Insufficient memory; nuking packet. 
Jan  7 20:12:05 atlas last message repeated 64 times

Uaaa, it's baaack... :)
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07 18:44                                         ` Zlatko Calusic
  1999-01-07 19:33                                           ` Linus Torvalds
  1999-01-07 19:38                                           ` Zlatko Calusic
@ 1999-01-07 19:40                                           ` Andrea Arcangeli
  1999-01-09  6:28                                           ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Dax Kelson
  3 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-07 19:40 UTC (permalink / raw
  To: Zlatko Calusic
  Cc: Linus Torvalds, Steve Bergman, brent verner, Garst R. Reese,
	Kalle Andersson, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On 7 Jan 1999, Zlatko Calusic wrote:

> 2) In pre-5, under heavy load, free memory is hovering around
> freepages.min instead of being somewhere between freepages.low &
> freepages.max. This could make trouble for bursts of atomic
> allocations (networking!).

Agreed and I just fixed that with my updates to the memory trashing
heuristic (see also the second patch in one of my last emails). 

A new minimal patch against 2.2.0-pre5 is this:

Index: page_alloc.c
===================================================================
RCS file: /var/cvs/linux/mm/page_alloc.c,v
retrieving revision 1.1.1.6
diff -u -2 -r1.1.1.6 page_alloc.c
--- page_alloc.c	1999/01/07 11:21:35	1.1.1.6
+++ linux/mm/page_alloc.c	1999/01/07 19:34:58
@@ -4,4 +4,5 @@
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  trashing_memory heuristic. Copyright (C) 1999  Andrea Arcangeli
  */
 
@@ -259,7 +260,4 @@
 		 * to free things up until things are better.
 		 *
-		 * Normally we shouldn't ever have to do this, with
-		 * kswapd doing this in the background.
-		 *
 		 * Most notably, this puts most of the onus of
 		 * freeing up memory on the processes that _use_
@@ -269,8 +267,9 @@
 			if (!current->trashing_memory)
 				goto ok_to_allocate;
-			if (nr_free_pages > freepages.low) {
+			if (nr_free_pages > freepages.high) {
 				current->trashing_memory = 0;
 				goto ok_to_allocate;
-			}
+			} else if (nr_free_pages > freepages.low)
+				goto ok_to_allocate;
 		}
 		/*



The problem is that I don't know if it's going to hurt performance... If
somebody would try it out, that would be helpful... I don't think it can
hurt but...

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07 12:02                                       ` Andrea Arcangeli
@ 1999-01-07 20:27                                         ` Linus Torvalds
  1999-01-07 23:56                                           ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-07 20:27 UTC (permalink / raw
  To: Andrea Arcangeli; +Cc: Steve Bergman, linux-kernel, linux-mm



On Thu, 7 Jan 1999, Andrea Arcangeli wrote:
> 
> The changes in 2.2.0-pre5 looks really cool! I think the only missing
> thing that I would like to see in is my calc_swapout_weight() thing. This
> my change would avoid swap_out() stalling the system too much in the
> presence of huge tasks and so it would allow the VM to scale better...

Note that if swap_out swaps something out, it will always return 1 (it has
to, as it sleeps), and that in turn will make us decrement our counter,
which will make us stop paging things out soon enough.. 

So I really don't think it's a scaling issue either.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 18:19                                       ` Linus Torvalds
@ 1999-01-07 20:35                                         ` Andrea Arcangeli
  1999-01-07 23:51                                           ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-07 20:35 UTC (permalink / raw
  To: Linus Torvalds
  Cc: steve, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm

On Thu, 7 Jan 1999, Linus Torvalds wrote:

> 
> 
> On Thu, 7 Jan 1999, Andrea Arcangeli wrote:
> > 
> > This first patch allows swap_out to have a more fine-grained weight. Should
> > help at least in low-memory environments.
> 
> The basic reason I didn't want to do this was that I thought it was wrong
> to try to base _any_ decision on any virtual memory sizes. The reason is
> simply that I think RSS isn't a very interesting thing to look at.

But now I am not looking at RSS, I am looking only at total_vm. The point
of the patch is only to be _balanced_ between passes even if in the system
there are some processes with a total_vm of 1Giga and some processes that
have a total_vm of 1kbyte. In normal conditions the patch _should_ make no
difference... This is my theory at least ;)

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07 19:33                                           ` Linus Torvalds
@ 1999-01-07 21:10                                             ` Zlatko Calusic
  0 siblings, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-07 21:10 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

Linus Torvalds <torvalds@transmeta.com> writes:

> On 7 Jan 1999, Zlatko Calusic wrote:
> > 
> > 1) Swap performance in pre-5 is much worse compared to pre-4 in
> > *certain* circumstances. I'm using quite stupid and unintelligent
> > program to check for raw swap speed (attached below). With 64 MB of
> > RAM I usually run it as 'hogmem 100 3' and watch for result which is
> > recently around 6 MB/sec. But when I lately decided to start two
> > instances of it like "hogmem 50 3 & hogmem 50 3 &" in pre-4 I got 2 x
> > 2.5 MB/sec and in pre-5 it is only 2 x 1 MB/sec and disk is making
> > very weird and frightening sounds. My conclusion is that now (pre-5)
> > system behaves much poorer when we have more than one thrashing
> > task. *Please*, check this, it is a quite serious problem.
> 
> Ok, will investigate. One thing you can test is to try out different
> "count" arguments to try_to_free_pages() (this was part of what Andrea
> did, btw). So instead of (page_alloc.c, line 285):
> 
> 	freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
> 
> you can try different things for the second argument: the thing Andrea did
> was something like
> 
> 	freed = try_to_free_pages(gfp_mask, freepages.high - nr_free_pages);
> 
> which could work well (one thing I'm nervous about is that this probably
> needs to be limited some way - it can be quite a large number on large
> machines, and that's why I'd like to hear comments from people).

OK, I'll check what can be done with that, as time permits.

> 
> > 2) In pre-5, under heavy load, free memory is hovering around
> > freepages.min instead of being somewhere between freepages.low &
> > freepages.max. This could make trouble for bursts of atomic
> > allocations (networking!).
> 
> The change above would change this too.

Yes, probably.

> 
> > 3) Nitpick #1: /proc/swapstats exist but is only filled with
> > zeros. Probably it should go away. I believe Stephen added it
> > recently, but only part of his patch got actually applied.
> 
> Maybe somebody can find a use for it.

Of course it can, but *right now* it's useless. So if nobody gets in
(Stephen?), I'll take it and fix it. I have a similar, even more
verbose patch for MM statistics with ~15 columns of information, but
that one does not apply cleanly on the latest kernels (obvious
reasons).

> 
> > 4) Nitpick #2": "Swap cache:" line in report of Alt-SysRq-M is not
> > useful as it is laid now. People have repeatedly sent patches (Rik,
> > Andrea...) to fix this but it is still not fixed, as of pre-5.
> 
> I never use it, so it hasn't been a big issue.

But it is, cause if properly done, it could give valuable information
about swap cache hit, and thus differentiate among various algorithms
we use for memory management. If I understand correctly, lots of
changes happened in MM in last few kernel revisions, but nobody knows
exactly what we're doing (there were few macro benchmarks done, which
are by definition not very useful for selecting among algorithms).

> 
> > 5) There is lots of #if 0 constructs in MM code, and also lots of
> > structures are not anymore used but still take precious memory in
> > compiled kernel and uncover itself under /proc (/proc/sys/vm/swapctl
> > for instance). Do you want a patch to remove this cruft?
> 
> Some of the #if 0 code should certainly be removed. Some of it is useful
> as a kind of commentary - sometimes code is removed not because it doesn't
> make sense, but because the implementation wasn't quite good enough.
> 

I'll see tomorrow if I can prepare one patch to fix few annoyances I
reported.

Good night!
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 17:56                                       ` Linus Torvalds
  1999-01-07 18:18                                         ` Rik van Riel
  1999-01-07 18:55                                         ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Zlatko Calusic
@ 1999-01-07 22:57                                         ` Linus Torvalds
  1999-01-08  1:16                                           ` Linus Torvalds
  1999-01-09  9:43                                           ` MM deadlock [was: Re: arca-vm-8...] Savochkin Andrey Vladimirovich
  1999-01-08  2:56                                         ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Eric W. Biederman
                                                           ` (2 subsequent siblings)
  5 siblings, 2 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-07 22:57 UTC (permalink / raw
  To: Eric W. Biederman
  Cc: Andrea Arcangeli, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm



On Thu, 7 Jan 1999, Linus Torvalds wrote:
> 
> The deadlock I suspect is:
>  - we're low on memory
>  - we allocate or look up a new block on the filesystem. This involves
>    getting the ext2 superblock lock, and doing a "bread()" of the free
>    block bitmap block.
>  - this causes us to try to allocate a new buffer, and we are so low on
>    memory that we go into try_to_free_pages() to find some more memory.
>  - try_to_free_pages() finds a shared memory file to page out.
>  - trying to page that out, it looks up the buffers on the filesystem it
>    needs, but deadlocks on the superblock lock.

Confirmed. Hpa was good enough to reproduce this, and my debugging code
caught the (fairly deep) deadlock: 

	system_call ->
	sys_write ->
	ext2_file_write ->
	ext2_getblk ->
	ext2_alloc_block ->	** gets superblock lock **
	ext2_new_block ->
	getblk ->
	refill_freelist ->
	grow_buffers ->
	__get_free_pages ->
	try_to_free_pages ->
	swap_out ->
	swap_out_process ->
	swap_out_vma ->
	try_to_swap_out ->
	filemap_swapout ->
	filemap_write_page ->
	ext2_file_write ->
	ext2_getblk ->
	ext2_alloc_block ->
	__wait_on_super		** BOOM - we want the superblock lock again **

and I suspect the fix is fairly simple: I'll just add back the __GFP_IO
bit (we kind of used to have one that did something similar) which will
make the swap-out code not write out shared pages when it allocates
buffers. 

The better fix would actually be to make sure that filesystems do not hold
locks around these kinds of blocking operations, but that is harder to do
at this late stage.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 20:35                                         ` Andrea Arcangeli
@ 1999-01-07 23:51                                           ` Linus Torvalds
  1999-01-08  0:04                                             ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-07 23:51 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: steve, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm



On Thu, 7 Jan 1999, Andrea Arcangeli wrote:
> > The basic reason I didn't want to do this was that I thought it was wrong
> > to try to base _any_ decision on any virtual memory sizes. The reason is
> > simply that I think RSS isn't a very interesting thing to look at.
> 
> But now I am not looking at RSS, I am looking only at total_vm. The point
> of the patch is only to be _balanced_ between passes even if in the system
> there are some processes with a total_vm of 1Giga and some processes that
> has a total_vm of 1kbyte. In normal conditions the patch _should_ make no
> differences... This in my theory at least ;)

Ehh, and how do you protect against somebody playing games with your mind
by doing _huge_ mappings of something that takes no real memory? The VM
footprint of a process is not necessarily related to how much physical
memory you use. 

Basically, I think the thing should either be simple or right, and yours
is somewhere in between - neither simple nor strictly correct.

Also, I've been happily deleting code, and it has worked wonderfully. This
patch adds logic and code back.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7
  1999-01-07 20:27                                         ` Linus Torvalds
@ 1999-01-07 23:56                                           ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-07 23:56 UTC (permalink / raw
  To: Linus Torvalds; +Cc: Steve Bergman, linux-kernel, linux-mm

On Thu, 7 Jan 1999, Linus Torvalds wrote:

> Note that if swap_out swaps something out, it will always return 1 (it has
> to, as it sleeps), and that in turn will make us decrement our counter,

Side note, when we swap out something we stop completely swapping out and we
return to try_to_free_pages() (still better for the issue we were talking
about ;). 

> So I really don't think it's a scaling issue either.

Yes, I think you are right. I am rejecting the calc_swapout_weight code.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 23:51                                           ` Linus Torvalds
@ 1999-01-08  0:04                                             ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-08  0:04 UTC (permalink / raw
  To: Linus Torvalds; +Cc: linux-kernel, linux-mm

On Thu, 7 Jan 1999, Linus Torvalds wrote:

> Ehh, and how do you protect against somebody playing games with your mind
> by doing _huge_ mappings of something that takes no real memory? The VM
> footprint of a process is not necessarily related to how much physical
> memory you use. 

I was in fact rejecting from the total_vm calculation all tasks with a rss == 0,
but yes, I am convinced that my more fine-grained counter is not needed.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 22:57                                         ` Linus Torvalds
@ 1999-01-08  1:16                                           ` Linus Torvalds
  1999-01-08 10:45                                             ` Andrea Arcangeli
  1999-01-09  9:43                                           ` MM deadlock [was: Re: arca-vm-8...] Savochkin Andrey Vladimirovich
  1 sibling, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-08  1:16 UTC (permalink / raw
  To: Eric W. Biederman
  Cc: Andrea Arcangeli, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm



On Thu, 7 Jan 1999, Linus Torvalds wrote:
>
> and I suspect the fix is fairly simple: I'll just add back the __GFP_IO
> bit (we kind of used to have one that did something similar) which will
> make the swap-out code not write out shared pages when it allocates
> buffers. 

Ok, here it is.. Stable.

		Linus

-----
diff -u --recursive --new-file v2.2.0-pre5/linux/include/linux/mm.h linux/include/linux/mm.h
--- v2.2.0-pre5/linux/include/linux/mm.h	Thu Jan  7 15:11:40 1999
+++ linux/include/linux/mm.h	Thu Jan  7 15:04:54 1999
@@ -315,14 +323,15 @@
 #define __GFP_LOW	0x02
 #define __GFP_MED	0x04
 #define __GFP_HIGH	0x08
+#define __GFP_IO	0x10
 
 #define __GFP_DMA	0x80
 
 #define GFP_BUFFER	(__GFP_LOW | __GFP_WAIT)
 #define GFP_ATOMIC	(__GFP_HIGH)
-#define GFP_USER	(__GFP_LOW | __GFP_WAIT)
-#define GFP_KERNEL	(__GFP_MED | __GFP_WAIT)
-#define GFP_NFS		(__GFP_HIGH | __GFP_WAIT)
+#define GFP_USER	(__GFP_LOW | __GFP_WAIT | __GFP_IO)
+#define GFP_KERNEL	(__GFP_MED | __GFP_WAIT | __GFP_IO)
+#define GFP_NFS		(__GFP_HIGH | __GFP_WAIT | __GFP_IO)
 
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
diff -u --recursive --new-file v2.2.0-pre5/linux/mm/vmscan.c linux/mm/vmscan.c
--- v2.2.0-pre5/linux/mm/vmscan.c	Thu Jan  7 15:11:41 1999
+++ linux/mm/vmscan.c	Thu Jan  7 15:09:46 1999
@@ -76,7 +76,6 @@
 		set_pte(page_table, __pte(entry));
 drop_pte:
 		vma->vm_mm->rss--;
-		tsk->nswap++;
 		flush_tlb_page(vma, address);
 		__free_page(page_map);
 		return 0;
@@ -99,6 +98,14 @@
 		pte_clear(page_table);
 		goto drop_pte;
 	}
+
+	/*
+	 * Don't go down into the swap-out stuff if
+	 * we cannot do I/O! Avoid recursing on FS
+	 * locks etc.
+	 */
+	if (!(gfp_mask & __GFP_IO))
+		return 0;
 
 	/*
 	 * Ok, it's really dirty. That means that


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 17:56                                       ` Linus Torvalds
                                                           ` (2 preceding siblings ...)
  1999-01-07 22:57                                         ` Linus Torvalds
@ 1999-01-08  2:56                                         ` Eric W. Biederman
  1999-01-09  0:50                                         ` David S. Miller
  1999-01-09  2:13                                         ` Stephen C. Tweedie
  5 siblings, 0 replies; 243+ messages in thread
From: Eric W. Biederman @ 1999-01-08  2:56 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm

>>>>> "LT" == Linus Torvalds <torvalds@transmeta.com> writes:

LT> On 6 Jan 1999, Eric W. Biederman wrote:
>> 
>> 1) With your comments on PG_dirty/(what shrink_mmap should do) you
>> have worked out what needs to happen for the mapped in memory case,
>> and I haven't quite gotten there.  Thank You.

LT> Note that it is not finalized. That's why I didn't write the code (which
LT> should be fairly simple), because it has some fairly subtle issues and
LT> thus becomes a 2.3.x thing, I very much suspect.

The code probably will be simple enough, but there are issues.
The complete issue for 2.3.x is dirty data in the page cache,
mapped shared pages are just a small subset.

This will be much more important for NFS, e2compr, and not
double buffering between the page cache and the buffer cache,
than for this case.


>> 2) I have tested using PG_dirty from shrink_mmap and it is a
>> performance problem because it loses all locality of reference,
>> and because it forces shrink_mmap into a dual role, of freeing and
>> writing pages, which need seperate tuning.

LT> Exactly. This is part of the complexity.

LT> The right solution (I _think_) is to conceptually always mark it PG_dirty
LT> in vmscan, and basically leave all the nasty cases to the filemap physical
LT> page scan. But in the simple cases (ie a swap-cached page that is only
LT> mapped by one process and doesn't have any other users), you'd start the
LT> IO "early".

This sounds good for the subset of the problem you are considering.

>From where I'm at, something that allocates a streamlined buffer_head
to the dirty pages sounds even better.  That and having a periodic
scan of the page tables that removes the dirty bit and marks the 
pages dirty, before we need the pages to be clean.

LT> Basically, I think that the stuff we handle now with the swap-cache we do
LT> well on already, and we'd only really want to handle the shared memory
LT> case with PG_dirty. But I think this is a 2.3 issue, and I only added the
LT> comment (and the PG_dirty define) for now. 

Thanks it does give some encouragement and some relief.  There are enough
things to get shaken out,  I am much more comfortable with early 2.3,
where we have time to convert things to a new way of doing things.

LT> It might be something good to test out, but I really don't want patches at
LT> this date (unless your patches also fix the above deadlock problem, which
LT> I can't see them doing ;)

Then I will proceed with my previous plan and see if I can get a fairly
complete set of patches ready for 2.3.early

Eric

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-08  1:16                                           ` Linus Torvalds
@ 1999-01-08 10:45                                             ` Andrea Arcangeli
  1999-01-08 19:06                                               ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-08 10:45 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Eric W. Biederman, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm

On Thu, 7 Jan 1999, Linus Torvalds wrote:

> Ok, here it is.. Stable.

Yesterday after your email I tried and I was able to reproduce the
deadlock here too. It's trivial: simply alloc a shared mapping of 160Mbyte
and start dirtying it and msync it in a loop. So I applied your patch and
the machine still deadlocked after a few seconds. I thought "argg
update_shared_mappings is faulting noooooo"!! So I removed
updated_shared_mappings() and I tried again and it still deadlocked... I
thought "oh, cool, I still have something to fix ;)". 

So I developed this debugging code (that I post here because I guess it
could be useful also to many others) to know which was the still pending
bug:

Index: sched.c
===================================================================
RCS file: /var/cvs/linux/kernel/sched.c,v
retrieving revision 1.1.1.1.2.37
diff -u -r1.1.1.1.2.37 sched.c
--- sched.c	1999/01/07 11:57:23	1.1.1.1.2.37
+++ sched.c	1999/01/08 10:41:53
@@ -22,6 +22,10 @@
  * current-task
  */
 
+/*
+ * Debug down() code. Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
 #include <linux/fdreg.h>
@@ -893,12 +897,27 @@
 	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
+void generate_oops (struct semaphore *sem)
+{
+	sema_init(sem, 9876);
+	wake_up(&sem->wait);
+}
+
 void __down(struct semaphore * sem)
 {
 	DOWN_VAR
+	struct timer_list timer;
+	init_timer (&timer);
+	timer.expires = jiffies + HZ*20;
+	timer.data = (unsigned long) sem;
+	timer.function = (void (*)(unsigned long)) generate_oops;
+	add_timer(&timer);
 	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
 	schedule();
+	if (atomic_read(&sem->count) == 9876)
+		*(int *) 0 = 0;
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
+	del_timer(&timer);
 }
 
 int __down_interruptible(struct semaphore * sem)


Then recompiled, rebooted, returned to running the deadlocking proggy; it deadlocked
again after some seconds and after 20 seconds I had a
nice Oops on the screen. SysRQ-K helped me to restore some functionality
in another console. Then I ran dmesg | ksymoops.... and I had this:

Using `/usr/src/linux/System.map' to map addresses to symbols.

>>EIP: c0111646 <__down+b2/160>
Trace: c0111574 <generate_oops>
Trace: c0189f58 <__down_failed+8/10>
Trace: c010ef1a <do_page_fault+56/340>
Trace: c0108c0d <error_code+2d/40>
Trace: c0111646 <__down+b2/160>
Trace: c0111574 <generate_oops>
Trace: c0189f58 <__down_failed+8/10>
Trace: c011dc59 <filemap_write_page+9d/138>
Trace: c011dd59 <filemap_swapout+65/7c>
Trace: c0121864 <try_to_swap_out+118/1c4>
Trace: c0121a18 <swap_out_vma+108/164>
Trace: c0121ad4 <swap_out_process+60/88>
Trace: c0121bdb <swap_out+df/fc>
Trace: c011cbb7 <shrink_mmap+11b/138>
Trace: c0121d1a <free_user_and_cache+1e/34>
Trace: c0121d76 <try_to_free_pages+46/a4>
Trace: c0122615 <__get_free_pages+d5/220>
Trace: c0126af2 <get_hash_table+52/64>
Trace: c0127bcf <grow_buffers+3b/ec>
Trace: c0126ca8 <refill_freelist+c/34>
Trace: c0126f3a <getblk+202/228>
Trace: c013af6c <ext2_alloc_block+68/13c>
Trace: c013b5c4 <block_getblk+15c/2b0>
Trace: c013b887 <ext2_getblk+16f/20c>
Trace: c0139d2b <ext2_file_write+40b/554>
Trace: c011dcc0 <filemap_write_page+104/138>
Trace: c011e0fe <filemap_sync+256/30c>
Trace: c011e297 <msync_interval+2f/7c>
Trace: c011e3d2 <sys_msync+ee/14c>
Trace: c0108ad4 <system_call+34/40>
Code: c0111646 <__down+b2/160> 
Code: c0111646 <__down+b2/160>  c7 05 00 00 00 	movl   $0x0,0x0
Code: c011164b <__down+b7/160>  00 00 00 00 00 
Code: c0111656 <__down+c2/160>  8b 75 d8       	movl   0xffffffd8(%ebp),%esi
Code: c0111659 <__down+c5/160>  c7 06 02 00 00 	movl   $0x2,(%esi)
Code: c011165f <__down+cb/160>  31 00          	xorl   %eax,(%eax)
Code: c0111667 <__down+d3/160>  90             	nop    
Code: c0111668 <__down+d4/160>  90             	nop    
Code: c0111669 <__down+d5/160>  90             	nop    

So I looked at buffer.c ;)

Index: buffer.c
===================================================================
RCS file: /var/cvs/linux/fs/buffer.c,v
retrieving revision 1.1.1.1.2.8
diff -u -r1.1.1.1.2.8 buffer.c
--- buffer.c	1999/01/07 11:57:21	1.1.1.1.2.8
+++ linux/fs/buffer.c	1999/01/08 10:27:09
@@ -689,7 +689,7 @@
  */
 static void refill_freelist(int size)
 {
-	if (!grow_buffers(GFP_KERNEL, size)) {
+	if (!grow_buffers(GFP_BUFFER, size)) {
 		wakeup_bdflush(1);
 		current->policy |= SCHED_YIELD;
 		schedule();


and now is really stable ;))

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-08 10:45                                             ` Andrea Arcangeli
@ 1999-01-08 19:06                                               ` Linus Torvalds
  0 siblings, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-08 19:06 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Eric W. Biederman, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm



On Fri, 8 Jan 1999, Andrea Arcangeli wrote:
> 
> So I looked at buffer.c ;)

Ehh, duh. I had it right in my tree, but the _patch_ I sent out only had
my mm changes, but not my fs changes. 

Embarrassing ;)

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 17:56                                       ` Linus Torvalds
                                                           ` (3 preceding siblings ...)
  1999-01-08  2:56                                         ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Eric W. Biederman
@ 1999-01-09  0:50                                         ` David S. Miller
  1999-01-09  2:13                                         ` Stephen C. Tweedie
  5 siblings, 0 replies; 243+ messages in thread
From: David S. Miller @ 1999-01-09  0:50 UTC (permalink / raw
  To: torvalds
  Cc: ebiederm+eric, andrea, steve, damonbrent, reese, kalle.andersson,
	Zlatko.Calusic, bmccann, alan, bredelin, sct, linux-kernel,
	H.H.vanRiel, linux-mm

   Date: 	Thu, 7 Jan 1999 09:56:03 -0800 (PST)
   From: Linus Torvalds <torvalds@transmeta.com>

   The positive news is that if I'm right in my suspicions it can only
   happen with shared writable mappings or shared memory segments. The
   bad news is that the bug appears rather old, and no immediate
   solution presents itself.

We could drop the superblock lock right before the actual bread()
call, grab it again right afterwards, then indicate back down to the
original caller that he should restart his search from the beginning
of the top-level logic in ext2_free_blocks/ext2_new_block.

The second time around a bread() won't happen.

>From a performance standpoint, since we are doing a disk I/O anyway,
the extra software overhead here will be moot.

However, I am concerned about deadlocks in this scheme where the
bread() kicks some other bitmap block back out to disk, and we loop
forever pingponging block bitmap blocks back and forth with no forward
progress being made.  Also the logic in these functions is non-trivial
and making an "obviously correct" patch, ignoring the possible
deadlock mentioned here, might not be easy.

We've had a couple strange issues like this, with recursive superblock
lock problems, recall the quota writeback deadlock Bill Hawes fixed a
few months ago, very similar.

Later,
David S. Miller
davem@dm.cobaltmicro.com
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-07 17:56                                       ` Linus Torvalds
                                                           ` (4 preceding siblings ...)
  1999-01-09  0:50                                         ` David S. Miller
@ 1999-01-09  2:13                                         ` Stephen C. Tweedie
  1999-01-09  2:34                                           ` Andrea Arcangeli
  1999-01-09 12:11                                           ` Andrea Arcangeli
  5 siblings, 2 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-09  2:13 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Eric W. Biederman, Andrea Arcangeli, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel,
	Rik van Riel, linux-mm

Hi,

On Thu, 7 Jan 1999 09:56:03 -0800 (PST), Linus Torvalds
<torvalds@transmeta.com> said:

> That way I can be reasonably hopeful that there are no new bugs introduced
> even though performance is very different. I _do_ have some early data
> that seems to say that this _has_ uncovered a very old deadlock condition: 
> something that could happen before but was almost impossible to trigger. 

> The deadlock I suspect is:
>  - we're low on memory
>  - we allocate or look up a new block on the filesystem. This involves
>    getting the ext2 superblock lock, and doing a "bread()" of the free
>    block bitmap block.
>  - this causes us to try to allocate a new buffer, and we are so low on
>    memory that we go into try_to_free_pages() to find some more memory.
>  - try_to_free_pages() finds a shared memory file to page out.
>  - trying to page that out, it looks up the buffers on the filesystem it
>    needs, but deadlocks on the superblock lock.

Hmm, I think that's a new one to me, but add to that one which I think
we've come across before and which I have not even thought about for a
couple of years at least: a large write() to a mmap()ed file can
deadlock for a similar reason, but on the inode write lock instead of
the superblock lock.

> The positive news is that if I'm right in my suspicions it can only happen
> with shared writable mappings or shared memory segments. The bad news is
> that the bug appears rather old, and no immediate solution presents
> itself. 

A couple of solutions come to mind: (1) make the superblock lock
recursive (ugh, horrible and it only works if we have an additional
mechanism to pin down bitmap buffers in the bitmap cache), or (2) allow
load_block_bitmap and friends to drop the superblock if it finds that it
needs to do an IO, and repeat if it happened.  However, what we're
basically saying here is that all operations on the superblock_lock have
to drop the lock if they want to allocate memory, and that's not a great
deal of fun: we might as well use the kernel spinlock.

It gets worse, because of course we cannot even rely on kswapd to
function correctly in this situation --- it will block on the superblock
lock just as happily as the current process's try_to_free_pages call
will.

I think the cleanest solution may be to reimplement some form of the old
GFP_IO flag, to prevent us from trying to use IO inside
try_to_free_pages() if we know we already have a lock which could
deadlock.  The easiest way I can see of achieving something like this is
to set current->flags |= PF_MEMALLOC while we hold the superblock lock,
or create another PF_NOIO flag which prevents try_to_free_pages from
doing anything with dirty pages.  I suspect that the PF_MEMALLOC option
might be good enough for starters; it will only do the wrong thing if we
have entirely exhausted the free page list.

The inode deadlock at least is relatively easy to fix, either by making
the inodelock recursive, or by having a separate sharable truncate lock
to prevent pages from being invalidated in the middle of the pageout
(which was the reason for the down() in the filemap write-page code in
the first place).  The truncate lock (or allocation/deallocation lock,
if you want to do it that way) makes a ton of sense; it avoids
serialising all writes while still making sure that truncates themselves
are exclusively locked.

>> 2) I have tested using PG_dirty from shrink_mmap and it is a
>> performance problem because it loses all locality of reference,
>> and because it forces shrink_mmap into a dual role, of freeing and
>> writing pages, which need separate tuning.

> Exactly. This is part of the complexity.

> The right solution (I _think_) is to conceptually always mark it PG_dirty
> in vmscan, and basically leave all the nasty cases to the filemap physical
> page scan. But in the simple cases (ie a swap-cached page that is only
> mapped by one process and doesn't have any other users), you'd start the
> IO "early".

The trouble is that when we come to do the physical IO, we really want
to cluster the IOs.  Doing the swap cache allocation from vmscan means
that we'll still be allocating virtually adjacent memory pages to
adjacent swap pages, but if we don't do the IO itself until
shrink_mmap(), we'll lose the IO clustering which we need for good
swapout performance.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-09  2:13                                         ` Stephen C. Tweedie
@ 1999-01-09  2:34                                           ` Andrea Arcangeli
  1999-01-09  9:30                                             ` Stephen C. Tweedie
  1999-01-09 12:11                                           ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-09  2:34 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Eric W. Biederman, Zlatko Calusic, Alan Cox,
	bredelin, linux-kernel, Rik van Riel, linux-mm

Hi Stephen!

On Sat, 9 Jan 1999, Stephen C. Tweedie wrote:

> deadlock.  The easiest way I can see of achieving something like this is
> to set current->flags |= PF_MEMALLOC while we hold the superblock lock,

Hmm, we must not prevent shrink_mmap() from running. So it seems plain
wrong to me to set PF_MEMALLOC before calling __get_free_pages(). It would
be much cleaner to use GFP_ATOMIC to achieve the same effect, btw ;).

Now I am too tired to follow the other part of your email (I'll read
tomorrow, now it's time to sleep for me... ;).

Forgot to mention: did you have comments about the FreeAfter() stuff? It
made sense to me (looking at page_io, if I remember well) but I have not
carefully reread it yet after Linus's comments on it. 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* 2.2.0-pre[56] swap performance poor with > 1 thrashing task
  1999-01-07 18:44                                         ` Zlatko Calusic
                                                             ` (2 preceding siblings ...)
  1999-01-07 19:40                                           ` Andrea Arcangeli
@ 1999-01-09  6:28                                           ` Dax Kelson
  1999-01-09  6:32                                             ` Zlatko Calusic
  1999-01-09  7:48                                             ` Benjamin Redelings I
  3 siblings, 2 replies; 243+ messages in thread
From: Dax Kelson @ 1999-01-09  6:28 UTC (permalink / raw
  To: Zlatko Calusic
  Cc: Linus Torvalds, Steve Bergman, Andrea Arcangeli, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie


On 7 Jan 1999, Zlatko Calusic wrote:

> 2.2.0-pre5 works very good, indeed, but it still has some not
> sufficiently explored nuisances:
> 
> 1) Swap performance in pre-5 is much worse compared to pre-4 in
> *certain* circumstances. I'm using quite stupid and unintelligent
> program to check for raw swap speed (attached below). With 64 MB of
> RAM I usually run it as 'hogmem 100 3' and watch for result which is
> recently around 6 MB/sec. But when I lately decided to start two
> instances of it like "hogmem 50 3 & hogmem 50 3 &" in pre-4 I got 2 x
> 2.5 MB/sec and in pre-5 it is only 2 x 1 MB/sec and disk is making
> very weird and frightening sounds. My conclusion is that now (pre-5)
> system behaves much poorer when we have more than one thrashing
> task. *Please*, check this, it is a quite serious problem.

I just tried this on 2.2.0-pre6 PentiumII 412Mhz, 128MB SDRAM, one IDE
disk (/ & swap).

./hogmem 100 3  (no swapping)
Memory speed: 167.60 MB/sec

./hogmem 200 3
Memory speed: 9.01 MB/sec

./hogmem 100 3 & ./hogmem 100 3
Memory speed: 0.96 MB/sec
Memory speed: 0.96 MB/sec

./hogmem 100 3 (no swap)
Memory speed: 180.18 MB/sec

./hogmem 200 3
Memory speed: 8.68 MB/sec

I then tried 

./hogmem 200 3 &

find / (on about 1.5GB of data on ext2 and vfat and nfs repeatedly) 

And launched netscape.  After 45 mins, I didn't restart the find, and
about 3 mins later the hogmem completed at 0.75MB/sec.  Netscape was
surprisingly responsive however.

Dax Kelson



--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: 2.2.0-pre[56] swap performance poor with > 1 thrashing task
  1999-01-09  6:28                                           ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Dax Kelson
@ 1999-01-09  6:32                                             ` Zlatko Calusic
  1999-01-09  6:44                                               ` Linus Torvalds
  1999-01-09  7:48                                             ` Benjamin Redelings I
  1 sibling, 1 reply; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-09  6:32 UTC (permalink / raw
  To: Dax Kelson
  Cc: Linus Torvalds, Steve Bergman, Andrea Arcangeli, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

Dax Kelson <dkelson@inconnect.com> writes:

> On 7 Jan 1999, Zlatko Calusic wrote:
> > 
> > 1) Swap performance in pre-5 is much worse compared to pre-4 in
> > *certain* circumstances. I'm using quite stupid and unintelligent
> > program to check for raw swap speed (attached below). With 64 MB of
> > RAM I usually run it as 'hogmem 100 3' and watch for result which is
> > recently around 6 MB/sec. But when I lately decided to start two
> > instances of it like "hogmem 50 3 & hogmem 50 3 &" in pre-4 I got 2 x
> > 2.5 MB/sec and in pre-5 it is only 2 x 1 MB/sec and disk is making
> > very weird and frightening sounds. My conclusion is that now (pre-5)
> > system behaves much poorer when we have more than one thrashing
> > task. *Please*, check this, it is a quite serious problem.
> 
> I just tried this on 2.2.0-pre6 PentiumII 412Mhz, 128MB SDRAM, one IDE
> disk (/ & swap).
> 
> ./hogmem 200 3
> Memory speed: 9.01 MB/sec
> 
> ./hogmem 100 3 & ./hogmem 100 3
> Memory speed: 0.96 MB/sec
> Memory speed: 0.96 MB/sec
> 

I have a fix for this, together with a great improvement in swapping
speed that I'll be sending in few moments, after some final testing.

pre6 is VERY good, and with my changes, we will have fastest MM ever!
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: 2.2.0-pre[56] swap performance poor with > 1 thrashing task
  1999-01-09  6:32                                             ` Zlatko Calusic
@ 1999-01-09  6:44                                               ` Linus Torvalds
  1999-01-09 18:58                                                 ` Andrea Arcangeli
                                                                   ` (2 more replies)
  0 siblings, 3 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-09  6:44 UTC (permalink / raw
  To: Zlatko Calusic
  Cc: Dax Kelson, Steve Bergman, Andrea Arcangeli, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie


Btw, if there are people there who actually like timing different things
(something I _hate_ doing - I lose interest if things become just a matter
of numbers rather than trying to get some algorithm right), then there's
one thing I'd love to hear about: the effect of trying to do some
access bit setting on buffer cache pages.

See my comments in linux/include/linux/fs.h, at around line 260 or so. 
It's the "touch_buffer()" macro which is currently a no-op, and it is
entirely possible that it really should set the PG_referenced bit. 

As a no-op, it can now randomly and unpredictably result in even worthwhile
buffers just being thrown out - possibly quite soon after they've been
loaded in. I happen to believe that it doesn't actually matter (and I'm
not convinced that marking the pages referenced has no downsides), but I'm
too lazy to bother to test it. 

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: 2.2.0-pre[56] swap performance poor with > 1 thrashing task
  1999-01-09  7:48                                             ` Benjamin Redelings I
@ 1999-01-09  6:53                                               ` Linus Torvalds
  0 siblings, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-09  6:53 UTC (permalink / raw
  To: Benjamin Redelings I
  Cc: Dax Kelson, Zlatko Calusic, Steve Bergman, Andrea Arcangeli,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie



On Fri, 8 Jan 1999, Benjamin Redelings I wrote:
>
> 	Maybe this is not really a problem with swapping, but more with
> concurrent I/O in general,

It's really easy to get really crappy performance with concurrent IO, if
you end up just seeking back and forth on the disk - which is why we
should be trying to cluster our IO. Sounds like we end up with silly
behaviour where one process is paging in from one area of the disk while
the other is paging out to another area, resulting in all the time spent
in just moving the disk head rather than moving any actual data.

Or something silly like that. The fix is probably not all that complex: 
the code is supposed to try to avoid it, but I bet I just had some idiotic
braino that just completely defeated the whole idea. It sounds like Zlatko
found my braino already.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: 2.2.0-pre[56] swap performance poor with > 1 thrashing task
  1999-01-09  6:28                                           ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Dax Kelson
  1999-01-09  6:32                                             ` Zlatko Calusic
@ 1999-01-09  7:48                                             ` Benjamin Redelings I
  1999-01-09  6:53                                               ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Benjamin Redelings I @ 1999-01-09  7:48 UTC (permalink / raw
  To: Dax Kelson
  Cc: Zlatko Calusic, Linus Torvalds, Steve Bergman, Andrea Arcangeli,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

	Maybe this is not really a problem with swapping, but more with
concurrent I/O in general, because I KNOW that running low-priority
niced jobs in the background (e.g. updatedb) can seriously degrade
performance of tasks in the foreground (e.g. netscape) that are doing a
minimal amount of I/O.  I think I've seen a few people mention this in
the past also.
	In any case, I've kind of assumed that that was the way it is supposed
to be.  Perhaps it is just that IDE drives really don't like writing 2
files at once.  Or that the background task does a lot of I/O, and the
clustering algorithm makes sure it all gets written before anything else
happens.  Anyway, I bet those explanations are wrong, but maybe there is
another explanation.... I don't know.
	Ah.  So Zlatko has a patch.  I look forward to it, and hope it improves
performance of non-swapping applications also.

-benRI
-- 
I don't need     education.
I don't need ANY education.
I don't need NO  education.

Benjamin Redelings I       <><      http://sdcc13.ucsd.edu/~bredelin
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-09  2:34                                           ` Andrea Arcangeli
@ 1999-01-09  9:30                                             ` Stephen C. Tweedie
  0 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-09  9:30 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Stephen C. Tweedie, Linus Torvalds, Eric W. Biederman,
	Zlatko Calusic, Alan Cox, bredelin, linux-kernel, Rik van Riel,
	linux-mm

Hi,

On Sat, 9 Jan 1999 03:34:56 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> Hi Stephen!
> On Sat, 9 Jan 1999, Stephen C. Tweedie wrote:

>> deadlock.  The easiest way I can see of achieving something like this is
>> to set current->flags |= PF_MEMALLOC while we hold the superblock lock,

> Hmm, we must not avoid shrink_mmap() to run. So I see plain wrong to set
> the PF_MEMALLOC before call __get_free_pages(). Very cleaner to use
> GFP_ATOMIC to achieve the same effect btw ;).

No, there are about a squillion possible places where we might try to
allocate memory with the superblock lock; updating them all to make
the gfp parameter conditional is gross!

Anyway, the whole point of PF_MEMALLOC is that it says we are
currently in the middle of an operation which has subtle deadlock or
stack overflow semantics wrt allocations, so always try to make
allocations from the free list.  In this case, the number of such
allocations we expect is small, so this is reasonable.  And yes, using
a new flag as opposed to PF_MEMALLOC would allow us to continue to
shrink_mmap (and in fact also to unmap clean pages) while preventing
recursive IO.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* MM deadlock [was: Re: arca-vm-8...]
  1999-01-07 22:57                                         ` Linus Torvalds
  1999-01-08  1:16                                           ` Linus Torvalds
@ 1999-01-09  9:43                                           ` Savochkin Andrey Vladimirovich
  1999-01-09 18:00                                             ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Savochkin Andrey Vladimirovich @ 1999-01-09  9:43 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, steve, Eric W. Biederman, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel,
	Rik van Riel, linux-mm

Hi,

I've found an another deadlock.
Two processes were locked trying to grab an inode write semaphore.
Their call traces are (in diff format):

 Using `map-2.2.0pre5-1' to map addresses to symbols.
 
 Trace: c010f038 <__down+58/90>
 Trace: c018d080 <__down_failed+8/c>
 Trace: c011abaa <filemap_write_page+a6/15c>
 Trace: c011acad <filemap_swapout+4d/60>
 Trace: c011e2ae <try_to_swap_out+10a/1ac>
 Trace: c011e45a <swap_out_vma+10a/174>
 Trace: c011e521 <swap_out_process+5d/8c>
 Trace: c011e60b <swap_out+bb/e4>
 Trace: c011e75b <try_to_free_pages+4b/70>
 Trace: c011ef61 <__get_free_pages+b5/1dc>
-Trace: c0119cd7 <try_to_read_ahead+2f/124>
-Trace: c011a970 <filemap_nopage+170/304>
-Trace: c0118888 <do_no_page+54/e4>
-Trace: c01189e4 <handle_mm_fault+cc/168>
+Trace: c0118375 <do_wp_page+19/210>
+Trace: c0118a3a <handle_mm_fault+122/168>
 Trace: c010ce9f <do_page_fault+143/364>

I suspect that one of the processes grabbed the semaphore and then deadlocked
trying to do it again.  Probably the process invoked write()
with the data having been swapped out.  The page fault handler
tried to free some memory and try_to_free_pages decided to write
out dirty pages of a shared mapping.  By accident the dirty pages
happened to belong to the file the process had started to write to.

A simple solution will be to check if the inode semaphore is held
before trying to write pages out and skip the mapping if it is.
However, it doesn't seem to be a very good solution, because if most of
the memory is occupied by dirty pages of a shared mapping, then
writing the pages out is exactly the right thing to do.

Best wishes
					Andrey V.
					Savochkin

On Thu, Jan 07, 1999 at 02:57:34PM -0800, Linus Torvalds wrote:
[snip]
> Confirmed. Hpa was good enough to reproduce this, and my debugging code
> caught the (fairly deep) deadlock: 
> 
> 	system_call ->
> 	sys_write ->
> 	ext2_file_write ->
> 	ext2_getblk ->
> 	ext2_alloc_block ->	** gets superblock lock **
> 	ext2_new_block ->
> 	getblk ->
> 	refill_freelist ->
> 	grow_buffers ->
> 	__get_free_pages ->
> 	try_to_free_pages ->
> 	swap_out ->
> 	swap_out_process ->
> 	swap_out_vma ->
> 	try_to_swap_out ->
> 	filemap_swapout ->
> 	filemap_write_page ->
> 	ext2_file_write ->
> 	ext2_getblk ->
> 	ext2_alloc_block ->
> 	__wait_on_super		** BOOM - we want the superblock lock again **
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]]
  1999-01-09  2:13                                         ` Stephen C. Tweedie
  1999-01-09  2:34                                           ` Andrea Arcangeli
@ 1999-01-09 12:11                                           ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-09 12:11 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Eric W. Biederman, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, linux-kernel, Rik van Riel, linux-mm

On Sat, 9 Jan 1999, Stephen C. Tweedie wrote:

> couple of years at least: a large write() to a mmap()ed file can
> deadlock for a similar reason, but on the inode write lock instead of
> the superblock lock.

Right. Look at the Oops report I generated at deadlock time and you'll see
that my kernel deadlocked in filemap_write_page() on the inode semaphore. 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-09  9:43                                           ` MM deadlock [was: Re: arca-vm-8...] Savochkin Andrey Vladimirovich
@ 1999-01-09 18:00                                             ` Linus Torvalds
  1999-01-09 18:41                                               ` Andrea Arcangeli
  1999-01-09 21:50                                               ` Linus Torvalds
  0 siblings, 2 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-09 18:00 UTC (permalink / raw
  To: Savochkin Andrey Vladimirovich
  Cc: Andrea Arcangeli, steve, Eric W. Biederman, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel,
	Rik van Riel, linux-mm



On Sat, 9 Jan 1999, Savochkin Andrey Vladimirovich wrote:
> 
> I've found an another deadlock.

Yes. This is a case I knew about, and that Alan already mentioned. Trying
to write from a shared mapping has a path that can take the write
semaphore twice.

This one is a whole lot harder to fix - the previous one needed only a
simple extra flag, this one is truly nasty.

The cleanest solution I can think of is actually to allow semaphores to be
recursive. I can do that with minimal overhead (just one extra instruction
in the non-contention case), so it's not too bad, and I've wanted to do it
for certain other things, but it's still a nasty piece of code to mess
around with. 

Oh, well. I don't think I have much choice. Making the swap-out routines
refuse to touch an inode that is busy is a sure way to allow people to
let bad users lock down infinite amounts of memory.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-09 18:00                                             ` Linus Torvalds
@ 1999-01-09 18:41                                               ` Andrea Arcangeli
  1999-01-10 21:41                                                 ` Stephen C. Tweedie
  1999-01-09 21:50                                               ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-09 18:41 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Savochkin Andrey Vladimirovich, steve, Eric W. Biederman,
	brent verner, Garst R. Reese, Kalle Andersson, Zlatko Calusic,
	Ben McCann, Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel,
	Rik van Riel, linux-mm

On Sat, 9 Jan 1999, Linus Torvalds wrote:

> refuse to touch an inode that is busy is a sure way to allow people to

What do you mean for busy? What about refusing filemap_write_page() in
filemap_swapout() only if
!atomic_count(&vma->vm_file->d_entry->d_inode->i_sem.count)?

That way other no-fs path could still put the dirty pages of the shared
mapping on disk. Today I had a really little time to play with Linux due
OFFTOPIC University studies (I should never play with Linux :() so I had
not time to try out this my new idea, so maybe I am missing something... 

My other thoughts about the topic are: maybe making the inode semaphore
recursive could be better anyway, so better to do that now? I don't know
exactly what recursive means here ;), I guess like lock_kernel(). But that way
we would not be sure to preserve data integrity if the same process did
crazy things; right now we would "only" deadlock in such a case. 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: 2.2.0-pre[56] swap performance poor with > 1 thrashing task
  1999-01-09  6:44                                               ` Linus Torvalds
@ 1999-01-09 18:58                                                 ` Andrea Arcangeli
  1999-01-11  9:21                                                 ` Buffer handling (setting PG_referenced on access) Zlatko Calusic
  1999-01-16 17:35                                                 ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Andrea Arcangeli
  2 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-09 18:58 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Zlatko Calusic, Dax Kelson, Steve Bergman, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

I think there are problems with 2.2.0-pre6 VM (even if I have not tried it
yet really). Latest time I tried on previous kernel to use in
__get_free_pages() a try_to_free_pages weight > than MAX_SWAP_CLUSTER (aka
freepages.high - nr_free_pages) I had bad impact of VM balance under
swapping. 

The problem is try_to_free_pages() implementation. Using a lower weight as
in pre5 we were sure to return to shrink_mmap with more frequency and so
getting more balance. Instead now we risk returning to only swapping out
without making real free memory space.

In the patch there's also some cosmetic change (like s/if/else if/). The
priority = 8 is to go in the swap path more easily.

Ah and probably we could reinsert the swapout_interval sysctl with default
value of HZ (not done yet due lack of time).

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.11 linux/mm/vmscan.c:1.1.1.1.2.81
--- linux/mm/vmscan.c:1.1.1.11	Sat Jan  9 12:58:26 1999
+++ linux/mm/vmscan.c	Sat Jan  9 19:30:01 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache(), always async swapout.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -199,11 +204,11 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		address += PAGE_SIZE;
+		tsk->swap_address = address;
 		if (result)
 			return result;
-		address += PAGE_SIZE;
 		pte++;
 	} while (address < end);
 	return 0;
@@ -325,7 +330,7 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
+	else if (counter > nr_tasks)
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
@@ -438,13 +443,22 @@
 		 * point is to make sure that the system doesn't stay
 		 * forever in a really bad memory squeeze.
 		 */
-		if (nr_free_pages < freepages.high)
-			try_to_free_pages(0, 16);
+		if (nr_free_pages < freepages.low)
+			try_to_free_pages(0, freepages.high - nr_free_pages);
 	}
 
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	if (shrink_mmap(priority, gfp_mask))
+		return 1;
+	if (swap_out(priority, gfp_mask))
+		return 1;
+	return 0;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,33 +471,32 @@
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
 	int priority;
+	static int state = 0;
 
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	priority = 8;
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			kmem_cache_reap(gfp_mask);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 


Another patch I consider right is this:

Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.7 linux/mm/page_alloc.c:1.1.1.1.2.25
--- linux/mm/page_alloc.c:1.1.1.7	Sat Jan  9 12:58:25 1999
+++ linux/mm/page_alloc.c	Fri Jan  8 00:57:18 1999
@@ -3,6 +3,7 @@
  *
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  trashing_memory heuristic. Copyright (C) 1999  Andrea Arcangeli
  */
 
 #include <linux/config.h>
@@ -265,10 +266,11 @@
 		if (nr_free_pages > freepages.min) {
 			if (!current->trashing_memory)
 				goto ok_to_allocate;
-			if (nr_free_pages > freepages.low) {
+			if (nr_free_pages > freepages.high) {
 				current->trashing_memory = 0;
 				goto ok_to_allocate;
-			}
+			} else if (nr_free_pages > freepages.low)
+				goto ok_to_allocate;
 		}
 		/*
 		 * Low priority (user) allocations must not


This will allow the system to be less close to freepages.min. Both the two
patches applied to pre6 make arca-vm-13-against-pre5 and arca-vm-13 has
been reported by Steve to give the _same_ timing numbers as pre5 with its
latest bench (with the difference that arca-vm-13 was generating 1/2 of
swap hit than pre5). I guess he will try to do some other bench (as the
image test soon) next days though.

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-09 18:00                                             ` Linus Torvalds
  1999-01-09 18:41                                               ` Andrea Arcangeli
@ 1999-01-09 21:50                                               ` Linus Torvalds
  1999-01-10 11:56                                                 ` Savochkin Andrey Vladimirovich
  1999-01-10 16:59                                                 ` Stephen C. Tweedie
  1 sibling, 2 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-09 21:50 UTC (permalink / raw
  To: Savochkin Andrey Vladimirovich
  Cc: Andrea Arcangeli, steve, Eric W. Biederman, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel,
	Rik van Riel, linux-mm



On Sat, 9 Jan 1999, Linus Torvalds wrote:
> 
> The cleanest solution I can think of is actually to allow semaphores to be
> recursive. I can do that with minimal overhead (just one extra instruction
> in the non-contention case), so it's not too bad, and I've wanted to do it
> for certain other things, but it's still a nasty piece of code to mess
> around with. 
> 
> Oh, well. I don't think I have much choice.

Does anybody know semaphores by heart? I've got code that may well work,
but the race conditions for semaphores are nasty. As mentioned, this only
adds a single instruction to the common non-contended case, and I really
do believe it should be correct, but it is completely untested (so it
might not work at all), and it would be good to have somebody with some
theory go through this.. 

Basically, these simple changes should make it ok to do recursive
semaphore grabs, so

	down(&sem);
	down(&sem);
	up(&sem);
	up(&sem);

should work and leave the semaphore unlocked.

Anybody? Semaphore theory used to be really popular at Universities, so
there must be somebody who has some automated proving program somewhere..

		Linus

-----
diff -u --recursive --new-file penguin/linux/include/asm-i386/semaphore.h linux/include/asm-i386/semaphore.h
--- penguin/linux/include/asm-i386/semaphore.h	Fri Jan  1 11:56:20 1999
+++ linux/include/asm-i386/semaphore.h	Sat Jan  9 13:37:29 1999
@@ -25,12 +25,23 @@
 
 struct semaphore {
 	atomic_t count;
+	unsigned long owner;
 	int waking;
 	struct wait_queue * wait;
 };
 
-#define MUTEX ((struct semaphore) { ATOMIC_INIT(1), 0, NULL })
-#define MUTEX_LOCKED ((struct semaphore) { ATOMIC_INIT(0), 0, NULL })
+/*
+ * Because we want the non-contention case to be
+ * fast, we save the stack pointer into the "owner"
+ * field, and to get the true task pointer we have
+ * to do the bit masking. That moves the masking
+ * operation into the slow path.
+ */
+#define semaphore_owner(sem) \
+	((struct task_struct *)((2*PAGE_MASK) & (sem)->owner))
+
+#define MUTEX ((struct semaphore) { ATOMIC_INIT(1), 0, 0, NULL })
+#define MUTEX_LOCKED ((struct semaphore) { ATOMIC_INIT(0), 0, 0, NULL })
 
 asmlinkage void __down_failed(void /* special register calling convention */);
 asmlinkage int  __down_failed_interruptible(void  /* params in registers */);
@@ -64,13 +75,14 @@
 	spin_unlock_irqrestore(&semaphore_wake_lock, flags);
 }
 
-static inline int waking_non_zero(struct semaphore *sem)
+static inline int waking_non_zero(struct semaphore *sem, struct task_struct *tsk)
 {
 	unsigned long flags;
 	int ret = 0;
 
 	spin_lock_irqsave(&semaphore_wake_lock, flags);
-	if (sem->waking > 0) {
+	if (sem->waking > 0 || semaphore_owner(sem) == tsk) {
+		sem->owner = (unsigned long) tsk;
 		sem->waking--;
 		ret = 1;
 	}
@@ -91,7 +103,8 @@
 		"lock ; "
 #endif
 		"decl 0(%0)\n\t"
-		"js 2f\n"
+		"js 2f\n\t"
+		"movl %%esp,4(%0)\n"
 		"1:\n"
 		".section .text.lock,\"ax\"\n"
 		"2:\tpushl $1b\n\t"
@@ -113,6 +126,7 @@
 #endif
 		"decl 0(%1)\n\t"
 		"js 2f\n\t"
+		"movl %%esp,4(%1)\n\t"
 		"xorl %0,%0\n"
 		"1:\n"
 		".section .text.lock,\"ax\"\n"
diff -u --recursive --new-file penguin/linux/kernel/sched.c linux/kernel/sched.c
--- penguin/linux/kernel/sched.c	Mon Jan  4 23:15:49 1999
+++ linux/kernel/sched.c	Sat Jan  9 13:37:16 1999
@@ -883,7 +883,7 @@
 	 * who gets to gate through and who has to wait some more.	 \
 	 */								 \
 	for (;;) {							 \
-		if (waking_non_zero(sem))	/* are we waking up?  */ \
+		if (waking_non_zero(sem, tsk))	/* are we waking up?  */ \
 			break;			/* yes, exit loop */
 
 #define DOWN_TAIL(task_state)			\


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-07  3:32                                     ` Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7 Steve Bergman
  1999-01-07 12:02                                       ` Andrea Arcangeli
  1999-01-07 17:35                                       ` Linus Torvalds
@ 1999-01-09 22:39                                       ` Steve Bergman
  1999-01-10  0:28                                         ` Steve Bergman
  1999-01-11  3:47                                         ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Gregory Maxwell
  2 siblings, 2 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-09 22:39 UTC (permalink / raw
  To: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Linus Torvalds, Alan Cox, Stephen C. Tweedie

In an effort to streamline testing, I am now running just:

make depend; make clean; make bzlilo

and leaving out the modules part.  I am also compiling only a minimal kernel
with no options at all selected in menuconfig.  I have added an idle mysql
server to the mix, which still includes netscape and a number of the usual
daemons (sendmail, lpd, inetd, etc.) along with vmstat 1, top, and ping
remote_host.  Please remember Linus' caution about the "swaps" number.  Here are
the latest results:

In 16MB:

pre6+zlatko_patch	5:29	192527	149728	3554
pre6			5:27	192002	149694	4257
pre5			5:28	188566	148674	5646
arcavm13		5:32	188560	148234	1594

Really putting on the squeeze, I tried out mem=12M which forced about 24MB into
the swap area.

In 12MB:

pre6+zlatko_patch	22:14	383206	204482	57823
pre6			20:54	352934	191210	48678
pre5			Did not test
arcavm13		19:45	344452	180243	38977


They all seem about the same in 16MB.  arcavm13 looks good in 12MB.  Zlatko, let
me know if you have any specific tests you want me to run on your patch.

-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-09 22:39                                       ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
@ 1999-01-10  0:28                                         ` Steve Bergman
  1999-01-10  5:35                                           ` Linus Torvalds
  1999-01-11  3:47                                         ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Gregory Maxwell
  1 sibling, 1 reply; 243+ messages in thread
From: Steve Bergman @ 1999-01-10  0:28 UTC (permalink / raw
  To: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Linus Torvalds, Alan Cox, Stephen C. Tweedie

Steve Bergman wrote:

I ran the "image test" (loading 116 jpg images simultaneously) on the latest
patches and got these results in 128MB (I end up with ~ 160MB in swap):

pre6+zlatko's_patch	2:35
pre6			2:27
pre5			1:58
arcavm13		9:13

Arcavm13 (the star performer in the low memory test) is having problems here. 
Pre5, which I performed about the same as the others in the my low memory test
and which I ignored in my even lower 12MB test looks quite good here.  Based on
it's good performance here, I decided to run the 12MB kernel compile test on it,
as well.  (See what happens when I try to cut corners...)

In 12MB:

pre6+zlatko_patch       22:14   383206  204482  57823
pre6                    20:54   352934  191210  48678
pre5                    19:35	334680	183732	93427 
arcavm13                19:45   344452  180243  38977

Pre5 is looking good.  Based upon the tests that I have run, anyway.  I agree
with the person who expressed a distrust of benchmarks.  But numbers are
necessary for tuning.  "Feels faster" is just not a very trustworthy thing.  So
I also agree with one of the responses:

"Try out your favorite apps and time some portion of them and post any
interesting numbers." (paraphrased)

Benchmarks are not the problem.  The problem is the lack of comprehensiveness,
or the tunnel-vision if you prefer, that benchmarks can lead one into.  Find a
way to quantify the things that you do everyday and post the results.

-Thanks
-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10  0:28                                         ` Steve Bergman
@ 1999-01-10  5:35                                           ` Linus Torvalds
  1999-01-10 18:33                                             ` Andrea Arcangeli
  1999-01-10 18:43                                             ` Steve Bergman
  0 siblings, 2 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10  5:35 UTC (permalink / raw
  To: Steve Bergman
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie



On Sat, 9 Jan 1999, Steve Bergman wrote:
> 
> I ran the "image test" (loading 116 jpg images simultaneously) on the latest
> patches and got these results in 128MB (I end up with ~ 160MB in swap):
> 
> pre6+zlatko's_patch	2:35
> pre6			2:27
> pre5			1:58
> arcavm13		9:13

Can you run pre6+zlatko with just the mm/page_alloc.c one-liner reverted
to pre5? That is, take pre6+zlatko, and just change 

	try_to_free_pages(gfp_mask, freepages.high - nr_free_pages);

back to

	try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);

That particular one-liner was almost certainly a mistake, it was done on
the mistaken assumption that the clustering problem was due to
insufficient write-time clustering - while zlatko found that it was
actually due to fragmentation in the swap area. With zlatkos patch, the
original SWAP_CLUSTER_MAX is probably better and almost certainly results
in smoother behaviour due to less extreme free_pages.. 

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-09 21:50                                               ` Linus Torvalds
@ 1999-01-10 11:56                                                 ` Savochkin Andrey Vladimirovich
  1999-01-10 17:59                                                   ` Andrea Arcangeli
  1999-01-10 22:33                                                   ` Stephen C. Tweedie
  1999-01-10 16:59                                                 ` Stephen C. Tweedie
  1 sibling, 2 replies; 243+ messages in thread
From: Savochkin Andrey Vladimirovich @ 1999-01-10 11:56 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, steve, Eric W. Biederman, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel,
	Rik van Riel, linux-mm

On Sat, Jan 09, 1999 at 01:50:14PM -0800, Linus Torvalds wrote:
> 
> 
> On Sat, 9 Jan 1999, Linus Torvalds wrote:
> > 
> > The cleanest solution I can think of is actually to allow semaphores to be
> > recursive. I can do that with minimal overhead (just one extra instruction
> > in the non-contention case), so it's not too bad, and I've wanted to do it
> > for certain other things, but it's still a nasty piece of code to mess
> > around with. 
> > 
> > Oh, well. I don't think I have much choice.

Well, doesn't semaphore recursion mean that the write atomicity
is no more guaranteed by inode's i_sem semaphore?

Best wishes
					Andrey V.
					Savochkin
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-09 21:50                                               ` Linus Torvalds
  1999-01-10 11:56                                                 ` Savochkin Andrey Vladimirovich
@ 1999-01-10 16:59                                                 ` Stephen C. Tweedie
  1999-01-10 18:13                                                   ` Andrea Arcangeli
  1999-01-10 18:35                                                   ` Linus Torvalds
  1 sibling, 2 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-10 16:59 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Savochkin Andrey Vladimirovich, Andrea Arcangeli, steve,
	Eric W. Biederman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm

Hi,

On Sat, 9 Jan 1999 13:50:14 -0800 (PST), Linus Torvalds
<torvalds@transmeta.com> said:

> On Sat, 9 Jan 1999, Linus Torvalds wrote:
>> 
>> The cleanest solution I can think of is actually to allow semaphores to be
>> recursive. I can do that with minimal overhead (just one extra instruction
>> in the non-contention case), so it's not too bad, and I've wanted to do it
>> for certain other things, but it's still a nasty piece of code to mess
>> around with. 

Ack.  I've been having a closer look, and making the superblock lock
recursive doesn't work: the ext2fs allocation code is definitely not
reentrant.  In particular, the bitmap buffers can get evicted out from
under our feet if we reenter the block allocation code, leading to nasty
filesystem and/or memory corruption.  The allocation code can also get
confused if the bitmap contents change between checking the group
descriptor for a block group and reading in the bitmap itself, leading
to potential ENOSPC errors turning up wrongly.

Preventing recursive VM access to the filesystem while we have the
superblock lock seems the only easy way out short of making the
allocation/truncate code fully reentrant.

On the other hand, it does look as if the inode deadlock is dealt with
OK if we just make that semaphore recursive; I can't see anywhere that
dies if we make that change.  This does somewhat imply that we may need
to make a distinction between reentrant and non-reentrant semaphores if
we go down this route.

--Stephen.
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 11:56                                                 ` Savochkin Andrey Vladimirovich
@ 1999-01-10 17:59                                                   ` Andrea Arcangeli
  1999-01-10 22:33                                                   ` Stephen C. Tweedie
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-10 17:59 UTC (permalink / raw
  To: Savochkin Andrey Vladimirovich
  Cc: Linus Torvalds, Eric W. Biederman, Zlatko Calusic, Alan Cox,
	bredelin, Stephen C. Tweedie, linux-kernel, Rik van Riel,
	linux-mm

On Sun, 10 Jan 1999, Savochkin Andrey Vladimirovich wrote:

> Well, doesn't semaphore recursion mean that the write atomicity
> is no more guaranteed by inode's i_sem semaphore?

Looking first Linus's patch I guessed right what does it mean recursion
over a semaphore (not that there would be many other choices though ;). As
I just pointed out the write atomicity is not more guaranteed from the
internal path of the same process (previously in such case we would
deadlock but sure we had no ways to corrupt things). It's still guaranteed
that many processes working on a critical section protected by the same
semaphore will not mess up things.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 16:59                                                 ` Stephen C. Tweedie
@ 1999-01-10 18:13                                                   ` Andrea Arcangeli
  1999-01-10 18:35                                                   ` Linus Torvalds
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-10 18:13 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Savochkin Andrey Vladimirovich, Eric W. Biederman,
	Zlatko Calusic, Alan Cox, bredelin, linux-kernel, Rik van Riel,
	linux-mm

On Sun, 10 Jan 1999, Stephen C. Tweedie wrote:

> dies if we make that change.  This does somewhat imply that we may need
> to make a distinction between reentrant and non-reentrant semaphores if
> we go down this route.

Agreed.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10  5:35                                           ` Linus Torvalds
@ 1999-01-10 18:33                                             ` Andrea Arcangeli
  1999-01-10 18:43                                             ` Steve Bergman
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-10 18:33 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Sat, 9 Jan 1999, Linus Torvalds wrote:

> Can you run pre6+zlatko with just the mm/page_alloc.c one-liner reverted
> to pre5? That is, take pre6+zlatko, and just change 

I have no time to try code these days :(

> 	try_to_free_pages(gfp_mask, freepages.high - nr_free_pages);
> 
> back to
> 
> 	try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
> 
> That particular one-liner was almost certainly a mistake, it was done on
> the mistaken assumption that the clustering problem was due to
> insufficient write-time clustering - while zlatko found that it was
> actually due to fragmentation in the swap area. With zlatkos patch, the
> original SWAP_CLUSTER_MAX is probably better and almost certainly results
> in smoother behaviour due to less extreme free_pages.. 

I don't know which is zlatkos patch but my point is that it's
try_to_free_pages that has to care to be balanced and to do things at
best. I want to be allowed to ask try_to_free_pages to free any kind of
space and such function has to run _always_ efficient. 

My current free_user_and_cache() (arca-vm-13) is bad because it doesn't
swapout aggressively. I am fixing it right now. When I'll have finished
I'll post the new patch.

BTW, the reason pre6 is slower than my current _bad_ free_user_and_cache() 
(arca-vm-13) in low memory machines is that in high memory machines the
freepages.min is something like 255 while in low memory machines
freepages.min it's close to SWAP_CLUSTER_MAX. So in low memory machines the
swapout cluster has to be reduced (because now swapout doesn't free
pages). free_user_and_cache() has to care about these issues and I see
plain wrong to hardwire the swap cluster size to a constant number since
the freepages values are dynamic (and also changable via sysctl).

I am not sure of all this, but this is currently my thought.

Comments?

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 16:59                                                 ` Stephen C. Tweedie
  1999-01-10 18:13                                                   ` Andrea Arcangeli
@ 1999-01-10 18:35                                                   ` Linus Torvalds
  1999-01-10 19:45                                                     ` Alan Cox
                                                                       ` (3 more replies)
  1 sibling, 4 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10 18:35 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Savochkin Andrey Vladimirovich, Andrea Arcangeli, steve,
	Eric W. Biederman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, Alan Cox, bredelin, linux-kernel,
	Rik van Riel, linux-mm



On Sun, 10 Jan 1999, Stephen C. Tweedie wrote:
> 
> Ack.  I've been having a closer look, and making the superblock lock
> recursive doesn't work

That's fine - the superblock lock doesn't need to be re-entrant, because
__GFP_IO is quite sufficient for that one.

The thing I want to make re-entrant is just semaphore accesses: at the
point where we would otherwise deadlock on the writer semaphore it's much
better to just allow nested writes. I suspect all filesystems can already
handle nested writes - they are a lot easier to handle than truly
concurrent ones.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10  5:35                                           ` Linus Torvalds
  1999-01-10 18:33                                             ` Andrea Arcangeli
@ 1999-01-10 18:43                                             ` Steve Bergman
  1999-01-10 19:08                                               ` Linus Torvalds
  1999-01-10 20:40                                               ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Andrea Arcangeli
  1 sibling, 2 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-10 18:43 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

Linus Torvalds wrote:

> Can you run pre6+zlatko with just the mm/page_alloc.c one-liner reverted
> to pre5? That is, take pre6+zlatko, and just change
> 
>         try_to_free_pages(gfp_mask, freepages.high - nr_free_pages);
> 
> back to
> 
>         try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
> 

OK, here are the updated results:

'Image test' in 128MB:

pre6+zlatko's_patch     	2:35
and with requested change	3:09
pre6                    	2:27
pre5                    	1:58
arcavm13                	9:13


I also ran the kernel compile test:

In 12MB:
				Elapsed	Maj.	Min.	Swaps
				-----	------	------	-----
pre6+zlatko_patch       	22:14   383206  204482  57823
and with requested change	22:23	378662	198194	51445
pre6                    	20:54   352934  191210  48678
pre5                    	19:35   334680  183732  93427 
arcavm13                	19:45   344452  180243  38977

The change seems to have hurt it in both cases.  What I am seeing on pre6 and
its derivatives is a *lot* of *swapin* activity.  Pre5 almost exclusively swaps
*out* during the image test, averaging about 1.25MB/sec (spends a lot of time at
around 2000k/sec) with very little swapping in.  All the pre6 derivatives swap
*in* quite heavily during the test.  The 'so' number sometimes drops to 0 for
seconds at a time.  It also looks like pre6 swaps out slightly more overall
(~165MB vs 160MB).

-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 19:45                                                     ` Alan Cox
@ 1999-01-10 19:03                                                       ` Andrea Arcangeli
  1999-01-10 21:39                                                         ` Stephen C. Tweedie
  1999-01-10 19:09                                                       ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-10 19:03 UTC (permalink / raw
  To: Alan Cox
  Cc: Linus Torvalds, sct, saw, steve, ebiederm+eric, damonbrent, reese,
	kalle.andersson, Zlatko.Calusic, bmccann, bredelin, linux-kernel,
	H.H.vanRiel, linux-mm

On Sun, 10 Jan 1999, Alan Cox wrote:

> Suspect makes me kind of nervous. Especially so close to 2.2 and given the
> normal results of making a bad file system error.

Another way to fix the thing could be to left only to kswapd the work to
sync shared-mmapped page out to disk when needed. We could wakeup kswapd
from the inside of filemap_swapout... It's dirty but should work fine
without the need of reentrant semaphores. BTW, before my
always-async-swapout idea kswapd was hiding the bug pretty well ;). 

Personally I like far more the clean solution but...

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 18:43                                             ` Steve Bergman
@ 1999-01-10 19:08                                               ` Linus Torvalds
  1999-01-10 19:23                                                 ` Vladimir Dergachev
                                                                   ` (2 more replies)
  1999-01-10 20:40                                               ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Andrea Arcangeli
  1 sibling, 3 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10 19:08 UTC (permalink / raw
  To: Steve Bergman
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie



On Sun, 10 Jan 1999, Steve Bergman wrote:
> 
> The change seems to have hurt it in both cases.  What I am seeing on pre6 and
> its derivatives is a *lot* of *swapin* activity.  Pre5 almost exclusively swaps
> *out* during the image test, averaging about 1.25MB/sec (spends a lot of time at
> around 2000k/sec) with very little swapping in.  All the pre6 derivatives swap
> *in* quite heavily during the test.  The 'so' number sometimes drops to 0 for
> seconds at a time.  It also looks like pre6 swaps out slightly more overall
> (~165MB vs 160MB).

This is interesting - the only thing that changed between pre5 and pre6
(apart from the line that I asked you to revert and that made things
worse) was actually the deadlock code. And the only thing _that_ changes,
in turn, is just the fact that when we allocate new buffers we cannot swap
out.

Now, this all actually makes sense: what happens when we limit swap-out
activity is that we limit our choices on what to do a _lot_.. That hurts
performance - we had to avoid a really nice path because of the deadlock
possibility, and due to that we can no longer make the best choice about
what to free up. So performance goes down. 

This shows up mainly on small memory machines, because on large memory
machines we still have a lot of choice about what to free up, so it's not
all that much of a problem.

But basically it seems that the reason pre-5 was so good was simply due to
the bug that allowed it to deadlock. Sad, because there's no way I can
re-introduce that nice behaviour without re-introducing the bug ;(

However, there are other things that we _can_ do. For example, we can
easily make the memory management code less eager to try to free memory if
it doesn't have the __GFP_IO bit set - because when it cannot swap it can
no longer maintain a good balance of pages, so it doesn't make sense for
it to try to free up all that many pages. 

Also, if we cannot swap out, we shouldn't bother looking at the
"trashing_memory" thing - there's just no point in making things worse. 

The logic for that would be something like the attached patch to
page_alloc.c..

		Linus

--- v2.2.0-pre6/linux/mm/page_alloc.c	Fri Jan  8 22:36:25 1999
+++ linux/mm/page_alloc.c	Sun Jan 10 11:04:17 1999
@@ -269,6 +231,9 @@
 				current->trashing_memory = 0;
 				goto ok_to_allocate;
 			}
+			/* If we cannot swap, ignore trashing_memory */
+			if (!(gfp_mask & __GFP_IO))
+				goto ok_to_allocate;
 		}
 		/*
 		 * Low priority (user) allocations must not
@@ -277,9 +242,12 @@
 		 */
 		current->trashing_memory = 1;
 		{
-			int freed;
+			int freed, pages;
+			pages = 16;
+			if (gfp_mask & __GFP_IO)
+				pages = freepages.high - nr_free_pages;
 			current->flags |= PF_MEMALLOC;
-			freed = try_to_free_pages(gfp_mask, freepages.high - nr_free_pages);
+			freed = try_to_free_pages(gfp_mask, pages);
 			current->flags &= ~PF_MEMALLOC;
 			if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 				goto nopage;


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 19:45                                                     ` Alan Cox
  1999-01-10 19:03                                                       ` Andrea Arcangeli
@ 1999-01-10 19:09                                                       ` Linus Torvalds
  1999-01-10 20:33                                                         ` Alan Cox
  1 sibling, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10 19:09 UTC (permalink / raw
  To: Alan Cox
  Cc: sct, saw, andrea, steve, ebiederm+eric, damonbrent, reese,
	kalle.andersson, Zlatko.Calusic, bmccann, bredelin, linux-kernel,
	H.H.vanRiel, linux-mm



On Sun, 10 Jan 1999, Alan Cox wrote:
> 
> Suspect makes me kind of nervous. Especially so close to 2.2 and given the
> normal results of making a bad file system error.

Umm.. The other choice is to leave in an old deadlock condition - that is
now well documented and thus well known?

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 19:08                                               ` Linus Torvalds
@ 1999-01-10 19:23                                                 ` Vladimir Dergachev
  1999-01-10 20:09                                                 ` Andrea Arcangeli
  1999-01-10 20:29                                                 ` Steve Bergman
  2 siblings, 0 replies; 243+ messages in thread
From: Vladimir Dergachev @ 1999-01-10 19:23 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie



> This shows up mainly on small memory machines, because on large memory
> machines we still have a lot of choice about what to free up, so it's not
> all that much of a problem.
> 
> But basically it seems that the reason pre-5 was so good was simply due to
> the bug that allowed it to deadlock. Sad, because there's no way I can
> re-introduce that nice behaviour without re-introducing the bug ;(

Stupid question: is it possible to teach it to recognize the deadlock?
If I understand things right "nice behaviour" happens when we don't have
the deadlock and the deadlock occurs not very often. So we might check
once a second whether we have been low on memory for a while with a lot of
swap available and if so revert to "bug-proof" behaviour. 

                       Vladimir Dergachev

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 18:35                                                   ` Linus Torvalds
@ 1999-01-10 19:45                                                     ` Alan Cox
  1999-01-10 19:03                                                       ` Andrea Arcangeli
  1999-01-10 19:09                                                       ` Linus Torvalds
  1999-01-10 22:18                                                     ` Stephen C. Tweedie
                                                                       ` (2 subsequent siblings)
  3 siblings, 2 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-10 19:45 UTC (permalink / raw
  To: Linus Torvalds
  Cc: sct, saw, andrea, steve, ebiederm+eric, damonbrent, reese,
	kalle.andersson, Zlatko.Calusic, bmccann, alan, bredelin,
	linux-kernel, H.H.vanRiel, linux-mm

> point where we would otherwise deadlock on the writer semaphore it's much
> better to just allow nested writes. I suspect all filesystems can already
> handle nested writes - they are a lot easier to handle than truly
> concurrent ones.

Suspect makes me kind of nervous. Especially so close to 2.2 and given the
normal results of making a bad file system error.

Alan

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 20:33                                                         ` Alan Cox
@ 1999-01-10 20:07                                                           ` Linus Torvalds
  0 siblings, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10 20:07 UTC (permalink / raw
  To: Alan Cox
  Cc: sct, saw, andrea, steve, ebiederm+eric, damonbrent, reese,
	kalle.andersson, Zlatko.Calusic, bmccann, bredelin, linux-kernel,
	H.H.vanRiel, linux-mm



On Sun, 10 Jan 1999, Alan Cox wrote:
> 
> Or to defer the I/O to the unlock

Hmm.. I don't generally like this idea because it is so easily fraught
with various nasty usage issues - just looking at the file semaphore would
probably make it fairly easy for somebody who knows how we work to come up
with some programs that may not deadlock but would create some really
pathological memory management behaviour. 

I'll think about it, though - together with some kswapd help we might well
be able to guarantee that nobody will be able to cause problems by keeping
a file busy.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 19:08                                               ` Linus Torvalds
  1999-01-10 19:23                                                 ` Vladimir Dergachev
@ 1999-01-10 20:09                                                 ` Andrea Arcangeli
  1999-01-10 20:29                                                 ` Steve Bergman
  2 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-10 20:09 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Sun, 10 Jan 1999, Linus Torvalds wrote:

> 
> 
> On Sun, 10 Jan 1999, Steve Bergman wrote:
> > 
> > The change seems to have hurt it in both cases.  What I am seeing on pre6 and
> > its derivatives is a *lot* of *swapin* activity.  Pre5 almost exclusively swaps
> > *out* during the image test, averaging about 1.25MB/sec (spends a lot of time at
> > around 2000k/sec) with very little swapping in.  All the pre6 derivatives swap
> > *in* quite heavily during the test.  The 'so' number sometimes drops to 0 for
> > seconds at a time.  It also looks like pre6 swaps out slightly more overall
> > (~165MB vs 160MB).
> 
> This is interesting - the only thing that changed between pre5 and pre6
> (apart from the line that I asked you to revert and that made things
> worse) was actually the deadlock code. And the only thing _that_ changes,

I have just given my explanation of why pre6 is completely unbalanced
(even though I have never run pre6 myself, I tried doing what pre6 does
some time before pre6 was out). I quote myself:

	I think there are problems with 2.2.0-pre6 VM (even if I have not tried it
	yet really). Latest time I tried on previous kernel to use in
	__get_free_pages() a try_to_free_pages weight > than MAX_SWAP_CLUSTER (aka
	freepages.high - nr_free_pages) I had bad impact of VM balance under
	swapping. 

	The problem is try_to_free_pages() implementation. Using a lower weight as
	in pre5 we were sure to return to shrink_mmap more frequently and so
	got more balance. Instead we now risk only swapping out
	without making real free memory space.

Your patch will not fix the problem. The problem is try_to_free_pages()
implementation.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 19:08                                               ` Linus Torvalds
  1999-01-10 19:23                                                 ` Vladimir Dergachev
  1999-01-10 20:09                                                 ` Andrea Arcangeli
@ 1999-01-10 20:29                                                 ` Steve Bergman
  1999-01-10 21:41                                                   ` Linus Torvalds
  1999-01-11 16:57                                                   ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
  2 siblings, 2 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-10 20:29 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

Linus Torvalds wrote:

> The logic for that would be something like the attached patch to
> page_alloc.c..


I tried the patch in the 'image test' and it helped little if any.  Still a lot
of swapping in and the numbers are close enough that I'm not sure it helped at
all.  This was a comparison between vanilla pre6 and vanilla
pre6+page_alloc_patch with no other patches applied.

-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 19:09                                                       ` Linus Torvalds
@ 1999-01-10 20:33                                                         ` Alan Cox
  1999-01-10 20:07                                                           ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Alan Cox @ 1999-01-10 20:33 UTC (permalink / raw
  To: Linus Torvalds
  Cc: alan, sct, saw, andrea, steve, ebiederm+eric, damonbrent, reese,
	kalle.andersson, Zlatko.Calusic, bmccann, bredelin, linux-kernel,
	H.H.vanRiel, linux-mm

> On Sun, 10 Jan 1999, Alan Cox wrote:
> > 
> > Suspect makes me kind of nervous. Especially so close to 2.2 and given the
> > normal results of making a bad file system error.
> 
> Umm.. The other choice is to leave in an old deadlock condition - that is
> now well documented and thus wellknown?

Or to defer the I/O to the unlock

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 18:43                                             ` Steve Bergman
  1999-01-10 19:08                                               ` Linus Torvalds
@ 1999-01-10 20:40                                               ` Andrea Arcangeli
  1999-01-10 20:50                                                 ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-10 20:40 UTC (permalink / raw
  To: Steve Bergman
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Sun, 10 Jan 1999, Steve Bergman wrote:

> 'Image test' in 128MB:

Steve, could you try the image test in 128Mbyte with this my new patch
(arca-vm-14) applied against clean 2.2.0-pre6?

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.11 linux/mm/vmscan.c:1.1.1.1.2.83
--- linux/mm/vmscan.c:1.1.1.11	Sat Jan  9 12:58:26 1999
+++ linux/mm/vmscan.c	Sun Jan 10 21:34:56 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache() and always async swapout original idea.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,6 +25,8 @@
 
 #include <asm/pgtable.h>
 
+int swapout_interval = HZ;
+
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -199,11 +206,11 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+		address += PAGE_SIZE;
+		tsk->swap_address = address;
 		if (result)
 			return result;
-		address += PAGE_SIZE;
 		pte++;
 	} while (address < end);
 	return 0;
@@ -306,7 +313,8 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int counter, assign;
+	unsigned long max_cnt;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -325,7 +333,7 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
+	else if (counter > nr_tasks)
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
@@ -338,7 +346,7 @@
 		for (; p != &init_task; p = p->next_task) {
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -430,7 +438,7 @@
 			break;
 		current->state = TASK_INTERRUPTIBLE;
 		run_task_queue(&tq_disk);
-		schedule_timeout(HZ);
+		schedule_timeout(swapout_interval);
 
 		/*
 		 * kswapd isn't even meant to keep up with anything,
@@ -438,13 +446,50 @@
 		 * point is to make sure that the system doesn't stay
 		 * forever in a really bad memory squeeze.
 		 */
-		if (nr_free_pages < freepages.high)
-			try_to_free_pages(0, 16);
+		if (nr_free_pages < freepages.min)
+			try_to_free_pages(0, freepages.high - nr_free_pages);
 	}
 
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	static unsigned long grow_swap_cache = 0;
+
+	if (nr_free_pages < freepages.min)
+		grow_swap_cache = 0;
+
+	switch (grow_swap_cache)
+	{
+	case 0:
+		if (shrink_mmap(priority, gfp_mask))
+			goto success;
+		if (swap_out(priority, gfp_mask))
+		{
+			grow_swap_cache = 1;
+			goto success;
+		}
+		goto failed;
+	default:
+		if (swap_out(priority, gfp_mask))
+		{
+			shrink_mmap(priority, gfp_mask);
+			if (++grow_swap_cache == SWAP_CLUSTER_MAX)
+				grow_swap_cache = 0;
+			goto success;
+		}
+		grow_swap_cache = 0;
+		if (shrink_mmap(priority, gfp_mask))
+			goto success;
+		goto failed;
+	}
+ failed:
+	return 0;
+ success:
+	return 1;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,33 +502,32 @@
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
 	int priority;
+	static int state = 0;
 
 	lock_kernel();
-
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
 
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	priority = 8;
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			kmem_cache_reap(gfp_mask);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 


Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 20:40                                               ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Andrea Arcangeli
@ 1999-01-10 20:50                                                 ` Linus Torvalds
  1999-01-10 21:01                                                   ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10 20:50 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Steve Bergman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie



On Sun, 10 Jan 1999, Andrea Arcangeli wrote:
> 
> Steve, could you try the image test in 128Mbyte with this my new patch
> (arca-vm-14) applied against clean 2.2.0-pre6?

Please don't do this.

>  	do {
>  		int result;
> -		tsk->swap_address = address + PAGE_SIZE;
>  		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
> +		address += PAGE_SIZE;
> +		tsk->swap_address = address;

The above is horribly broken, and should be killed on sight.

"swap_address" _must_ be set before try_to_swap_out(), because otherwise
when try-to_swap_out() sleeps any other processes trying to free will
start from the wrong address - unfairly penalizing the process in
question, as the accessed bits have been cleared.

If the above help, it only does so because it is unfair - which _can_ help
simply because other processes run faster.

It looks like you tried to optimize away one add, at the expense of
getting the whole algorithm wrong.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 20:50                                                 ` Linus Torvalds
@ 1999-01-10 21:01                                                   ` Andrea Arcangeli
  1999-01-10 21:51                                                     ` Steve Bergman
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-10 21:01 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Sun, 10 Jan 1999, Linus Torvalds wrote:

> The above is horribly broken, and should be killed on sight.

Whoops, thanks, and excuse me. I did not think of the sleep case... 

Here it is arca-vm-15 with at least this bug removed... excuse me again...

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.11 linux/mm/vmscan.c:1.1.1.1.2.83
--- linux/mm/vmscan.c:1.1.1.11	Sat Jan  9 12:58:26 1999
+++ linux/mm/vmscan.c	Sun Jan 10 21:34:56 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache() and always async swapout original idea.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,6 +25,8 @@
 
 #include <asm/pgtable.h>
 
+int swapout_interval = HZ;
+
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -306,7 +313,8 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int counter, assign;
+	unsigned long max_cnt;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -325,7 +333,7 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
+	else if (counter > nr_tasks)
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
@@ -338,7 +346,7 @@
 		for (; p != &init_task; p = p->next_task) {
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -430,7 +438,7 @@
 			break;
 		current->state = TASK_INTERRUPTIBLE;
 		run_task_queue(&tq_disk);
-		schedule_timeout(HZ);
+		schedule_timeout(swapout_interval);
 
 		/*
 		 * kswapd isn't even meant to keep up with anything,
@@ -438,13 +446,50 @@
 		 * point is to make sure that the system doesn't stay
 		 * forever in a really bad memory squeeze.
 		 */
-		if (nr_free_pages < freepages.high)
-			try_to_free_pages(0, 16);
+		if (nr_free_pages < freepages.min)
+			try_to_free_pages(0, freepages.high - nr_free_pages);
 	}
 
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	static unsigned long grow_swap_cache = 0;
+
+	if (nr_free_pages < freepages.min)
+		grow_swap_cache = 0;
+
+	switch (grow_swap_cache)
+	{
+	case 0:
+		if (shrink_mmap(priority, gfp_mask))
+			goto success;
+		if (swap_out(priority, gfp_mask))
+		{
+			grow_swap_cache = 1;
+			goto success;
+		}
+		goto failed;
+	default:
+		if (swap_out(priority, gfp_mask))
+		{
+			shrink_mmap(priority, gfp_mask);
+			if (++grow_swap_cache == SWAP_CLUSTER_MAX)
+				grow_swap_cache = 0;
+			goto success;
+		}
+		grow_swap_cache = 0;
+		if (shrink_mmap(priority, gfp_mask))
+			goto success;
+		goto failed;
+	}
+ failed:
+	return 0;
+ success:
+	return 1;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,33 +502,32 @@
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
 	int priority;
+	static int state = 0;
 
 	lock_kernel();
-
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
 
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	priority = 8;
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			kmem_cache_reap(gfp_mask);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 


I think you could try it now...

Andrea Arcangeli


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 19:03                                                       ` Andrea Arcangeli
@ 1999-01-10 21:39                                                         ` Stephen C. Tweedie
  0 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-10 21:39 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Alan Cox, Linus Torvalds, sct, saw, steve, ebiederm+eric,
	damonbrent, reese, kalle.andersson, Zlatko.Calusic, bmccann,
	bredelin, linux-kernel, H.H.vanRiel, linux-mm

Hi,

On Sun, 10 Jan 1999 20:03:41 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> Another way to fix the thing could be to leave only to kswapd the work of
> syncing shared-mmapped pages out to disk when needed. We could wake up
> kswapd from the inside of filemap_swapout... It's dirty but should work fine
> without the need of reentrant semaphores. 

Yep.  I had this working for swap a long time ago via a separate kswiod
thread for swap IO, but it didn't make a lot of difference at the time
(we weren't swapping very intelligently in those days, though).  It's
something I have thought of resurrecting, mainly because I'm nervous
that if kswapd spends too much time swapping asynchronously then we can
be left starved of real free pages on the free lists for interrupts. 

--Stephen

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 20:29                                                 ` Steve Bergman
@ 1999-01-10 21:41                                                   ` Linus Torvalds
  1999-01-10 23:33                                                     ` testing/pre-7 and do_poll() Chip Salzenberg
  1999-01-11 16:57                                                   ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
  1 sibling, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10 21:41 UTC (permalink / raw
  To: Steve Bergman
  Cc: Andrea Arcangeli, Garst R. Reese, Zlatko Calusic, Ben McCann,
	bredelin, linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie



On Sun, 10 Jan 1999, Steve Bergman wrote:
> 
> I tried the patch in the 'image test' and it helped little if any.  Still a lot
> of swapping in and the numbers are close enough that I'm not sure it helped at
> all.  This was a comparison between vanilla pre6 and vanilla
> pre6+page_alloc_patch with no other patches applied.

Ok, I think I now know why pre-6 looks so unbalanced. It's two issues. 

Basically, trying to swap out a large number of pages from one process
context is just doomed. It basically sucks, because

 - it has bad latency. This is further exacerbated by the per-process
   "thrashing_memory" flag, which means that if we were unlucky enough to
   be selected to be the process that frees up memory, we'll probably be
   stuck with it for a long time. That can make it extremely unfair under
   some circumstances - other processes may allocate the pages we free'd
   up, so that we keep on being counted as a memory trasher even if we
   really aren't. 

   Note that this shows most under "moderate" load - the problem doesn't
   tend to show itself if you have some process that is _really_
   allocating a lot of pages, because then that process will be correctly
   found by the trashing logic. But if you have lots of "normal load"
   processes, some of those can get really badly hurt by this.

   In particular, the worst case you have a number of processes that all
   allocate memory, but not very quickly - certainly not more quickly than
   we can page things out. What happens is that under these circumstances
   one of them gets marked as a "scapegoat", and once that happens all the
   others will just live off the pages that the scapegoat frees up, while
   the scapegoat itself doesn't make much progress at all because it is
   always just freeing memory for others. 

   The really bad behaviour tends to go away reasonably quickly, but while
   it happens it's _really_ unfair.

 - try_to_free_pages() just goes overboard, and starts paging stuff out
   without getting back to the nice balanced behaviour. This is what
   Andrea noticed.

   Essentially, once it starts failing the shrink_mmap() tests, it will
   just page things out crazily. Normally this is avoided by just always
   starting from shrink_mmap(), but if you ask try_to_free_pages() to try
   to free up a ton of pages, the balancing that it does is basically
   bypassed.

So basically pre-6 works _really_ well for the kind of stress-me stuff
that it was designed for: a few processes that are extremely memory
hungry. It gets close to perfect swap-out behaviour, simply because it is
optimized for getting into a paging rut. 

That makes for nice benchmarks, but it also explains why (a) sometimes
it's just not very nice for interactive behaviour and (b) why it under
normal load can easily swap much too eagerly.

Anyway, the first problem is fixed by making "trashing" be a global flag
rather than a per-process flag. Being per-process is really nice when it
finds the right process, but it's really unfair under a lot of other
circumstances. I'd rather be fair than get the best possible page-out
speed. 

Note that even a global flag helps: it still clusters the write-outs, and
means that processes that allocate more pages tend to be more likely to be
hit by it, so it still does a large part of what the per-process flag did
- without the unfairness (but admittedly being unfair sometimes gets you
better performance - you just have to be _very_ careful whom you target
with the unfairness, and that's the hard part). 

The second problem actually goes away by simply just not asking
try_to_free_pages() to free too many pages - and having the global
trashing flag makes it unnecessary to do so anyway because the flag will
essentially cluster the page-outs even without asking for them to be all
done in one large chunk (and now it's not just one process that gets hit
any more).

There's a "pre-7.gz" on ftp.kernel.org in testing, anybody interested? 
It's not the real thing, as I haven't done the write semaphore deadlock
thing yet, but that one will not affect normal users anyway so for
performance testing this should be equivalent. 

			Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-09 18:41                                               ` Andrea Arcangeli
@ 1999-01-10 21:41                                                 ` Stephen C. Tweedie
  1999-01-10 21:47                                                   ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-10 21:41 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Savochkin Andrey Vladimirovich, steve,
	Eric W. Biederman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	Stephen C. Tweedie, linux-kernel, Rik van Riel, linux-mm

Hi,

On Sat, 9 Jan 1999 19:41:36 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> On Sat, 9 Jan 1999, Linus Torvalds wrote:
>> refuse to touch an inode that is busy is a sure way to allow people to

> What do you mean for busy? What about refusing filemap_write_page() in
> filemap_swapout() only if
> !atomic_count(&vma->vm_file->d_entry->d_inode->i_sem.count)?

The problem with that is what happens if we have a large, active
write-mapped file with lots of IO activity on it; we become essentially
unable to swap that file out.  That has really nasty VM death
implications for things like databases.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 21:41                                                 ` Stephen C. Tweedie
@ 1999-01-10 21:47                                                   ` Linus Torvalds
  0 siblings, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-10 21:47 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Andrea Arcangeli, Savochkin Andrey Vladimirovich, steve,
	Eric W. Biederman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, Alan Cox, bredelin, linux-kernel,
	Rik van Riel, linux-mm



On Sun, 10 Jan 1999, Stephen C. Tweedie wrote:
> 
> The problem with that is what happens if we have a large, active
> write-mapped file with lots of IO activity on it; we become essentially
> unable to swap that file out.  That has really nasty VM death
> implications for things like databases.

Indeed. Maybe we really should use kswapd for this, especially now that
kswapd doesn't really do much else..

Btw, pre-6 had a bug in kswapd that is relevant to this discussion - it
used a 0 argument to try_to_free_pages(), even though kswapd very much is
able to do IO. (So in pre-6, waking up kswapd is the wrong thing to try to
do ;)

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 21:01                                                   ` Andrea Arcangeli
@ 1999-01-10 21:51                                                     ` Steve Bergman
  1999-01-10 22:50                                                       ` Results: arcavm15, et. al Steve Bergman
  0 siblings, 1 reply; 243+ messages in thread
From: Steve Bergman @ 1999-01-10 21:51 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

Andrea Arcangeli wrote:
> 
> Here it is arca-vm-15 with at least this bug removed... 
> 

In the image test, it looks very good:

arcavm15	1:59
pre6		2:27

The best run for arcavm15 was 1:53 which is a record for any kernel that
I've tried so far.

I'll try the low memory compile and see how it looks.

-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 18:35                                                   ` Linus Torvalds
  1999-01-10 19:45                                                     ` Alan Cox
@ 1999-01-10 22:18                                                     ` Stephen C. Tweedie
  1999-01-10 22:49                                                     ` Stephen C. Tweedie
  1999-01-11 14:11                                                     ` Savochkin Andrey Vladimirovich
  3 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-10 22:18 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Stephen C. Tweedie, Savochkin Andrey Vladimirovich,
	Andrea Arcangeli, steve, Eric W. Biederman, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, linux-kernel, Rik van Riel, linux-mm

Hi,

On Sun, 10 Jan 1999 10:35:10 -0800 (PST), Linus Torvalds
<torvalds@transmeta.com> said:

> The thing I want to make re-entrant is just semaphore accesses: at the
> point where we would otherwise deadlock on the writer semaphore it's much
> better to just allow nested writes. I suspect all filesystems can already
> handle nested writes - they are a lot easier to handle than truly
> concurrent ones.

We used to do it anyway, before inodes were locked for write, if I
remember correctly.  

What I'm after is something like the patch below for a fix (don't apply
it: it should work and should fix the problem, but it's really just for
illustration).  It enforces an i_atomic_allocate semaphore to lock
against truncate().  The write-page filemap code takes this semaphore,
but does _not_ take i_sem at all.  

Frankly, I really don't think we want to serialise writes so
aggressively in the first place.  In POSIX, O_APPEND is the only case
where we need to do this (and since that modifies i_size, it's a natural
case to do under the i_atomic_allocate semaphore in any case).

This patch should fix the problem in hand, but what I think we really
want is a read/write semaphore for i_atomic_allocate: we want normal
read and write IO to a file to guard against a concurrent truncate(),
but _not_ against each other (in situations such as threaded/async IO to
a database file, multiple outstanding IOs can be a big win).  Basically,
most writes should take out a read lock on the filesize so that the file
won't disappear from under their feet; only extending or truncating the
file should take out an i_atomic_allocate write lock (assuming the same
sorts of semantics for r/w semaphores as we already have for r/w
spinlocks).

Are there really any filesystems we know can't deal with
concurrent/reentrant writes to an inode?  We already have to deal with
concurrent reads with a single write in progress, after all.

--Stephen

----------------------------------------------------------------
--- fs/inode.c.~1~	Fri Jan  8 16:13:05 1999
+++ fs/inode.c	Sun Jan 10 21:58:46 1999
@@ -132,6 +132,7 @@
 	INIT_LIST_HEAD(&inode->i_dentry);
 	sema_init(&inode->i_sem, 1);
 	sema_init(&inode->i_atomic_write, 1);
+	sema_init(&inode->i_atomic_allocate, 1);
 }
 
 static inline void write_inode(struct inode *inode)
--- fs/open.c~	Fri Jan  8 17:24:19 1999
+++ fs/open.c	Sun Jan 10 21:59:49 1999
@@ -70,6 +70,7 @@
 	int error;
 	struct iattr newattrs;
 
+	down(&inode->i_atomic_allocate);
 	down(&inode->i_sem);
 	newattrs.ia_size = length;
 	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
@@ -81,6 +82,7 @@
 			inode->i_op->truncate(inode);
 	}
 	up(&inode->i_sem);
+	up(&inode->i_atomic_allocate);
 	return error;
 }
 
--- include/linux/fs.h.~1~	Sun Jan 10 21:56:23 1999
+++ include/linux/fs.h	Sun Jan 10 21:58:39 1999
@@ -358,6 +358,7 @@
 	unsigned long		i_nrpages;
 	struct semaphore	i_sem;
 	struct semaphore	i_atomic_write;
+	struct semaphore	i_atomic_allocate;
 	struct inode_operations	*i_op;
 	struct super_block	*i_sb;
 	struct wait_queue	*i_wait;
--- mm/filemap.c~	Fri Jan  8 16:13:06 1999
+++ mm/filemap.c	Sun Jan 10 22:01:52 1999
@@ -1113,9 +1113,9 @@
 	 * and file could be released ... increment the count to be safe.
 	 */
 	file->f_count++;
-	down(&inode->i_sem);
+	down(&inode->i_atomic_allocate);
 	result = do_write_page(inode, file, (const char *) page, offset);
-	up(&inode->i_sem);
+	up(&inode->i_atomic_allocate);
 	fput(file);
 	return result;
 }


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 11:56                                                 ` Savochkin Andrey Vladimirovich
  1999-01-10 17:59                                                   ` Andrea Arcangeli
@ 1999-01-10 22:33                                                   ` Stephen C. Tweedie
  1 sibling, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-10 22:33 UTC (permalink / raw
  To: Savochkin Andrey Vladimirovich
  Cc: Linus Torvalds, Andrea Arcangeli, steve, Eric W. Biederman,
	brent verner, Garst R. Reese, Kalle Andersson, Zlatko Calusic,
	Ben McCann, Alan Cox, bredelin, Stephen C. Tweedie, linux-kernel,
	Rik van Riel, linux-mm

Hi,

On Sun, 10 Jan 1999 14:56:18 +0300, Savochkin Andrey Vladimirovich
<saw@msu.ru> said:

> Well, doesn't semaphore recursion mean that the write atomicity
> is no more guaranteed by inode's i_sem semaphore?

Yes.  That's OK from one point of view --- there's nothing in the specs
which requires us to make writes atomic.  The question is whether any
filesystems rely on it internally in their implementation.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 18:35                                                   ` Linus Torvalds
  1999-01-10 19:45                                                     ` Alan Cox
  1999-01-10 22:18                                                     ` Stephen C. Tweedie
@ 1999-01-10 22:49                                                     ` Stephen C. Tweedie
  1999-01-11  6:04                                                       ` Eric W. Biederman
  1999-01-11 11:20                                                       ` Pavel Machek
  1999-01-11 14:11                                                     ` Savochkin Andrey Vladimirovich
  3 siblings, 2 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-10 22:49 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Stephen C. Tweedie, Savochkin Andrey Vladimirovich,
	Andrea Arcangeli, steve, Eric W. Biederman, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, linux-kernel, Rik van Riel, linux-mm

Hi,

On Sun, 10 Jan 1999 10:35:10 -0800 (PST), Linus Torvalds
<torvalds@transmeta.com> said:

> On Sun, 10 Jan 1999, Stephen C. Tweedie wrote:
>> 
>> Ack.  I've been having a closer look, and making the superblock lock
>> recursive doesn't work

> That's fine - the superblock lock doesn't need to be re-entrant, because
> __GFP_IO is quite sufficient for that one.

I'm no longer convinced about that.  I think it's much much worse.  A
bread() on an ext2 bitmap buffer with the superblock held is only safe
if the IO can complete without _ever_ relying on a GFP_IO allocation.
That means that any interrupt allocations required in that space have to
be satisfiable by kswapd without GFP_IO, or kswapd could deadlock on us.
It means that if our superblock-locked IO has to stall waiting for an
nbd server process or a raid daemon, then those daemons cannot safely do
GFP_IO.  It's really gross.

I think it's actually ugly enough that we cannot make it safe: we can
really only be sure if we prevent all GFP_IO from any process which
might be involved in our deadlock loop, or if we avoid doing any IO with
the superblock lock held.  

It really looks as if the right way around this is to prevent GFP_IO
from deadlocking in the first place, by moving the asynchronous page
writes out of kswapd/try_to_free_page and into a separate worker thread.
That way we can continue to try to reclaim memory somewhere else without
deadlocking.  In that case the only thing we are left having to worry
about is doing a synchronous swapout, where we end up blocking waiting
for the IO thread to complete.  

In fact, to make it really safe we'd need to avoid synchronous swapout
altogether: otherwise we can have

	    A			kswiod		nbd server process
	    lock_super();
	    bread(ndb device);
	    try_to_free_page();
	    rw_swap_page_async();
				filemap_write_page();
				lock_super();
	    wait_on_buffer();
						try_to_free_page();
						rw_swap_page_sync();
						Oops, kswiod is stalled.

Can we get away without synchronous swapout?  Notice that in this case,
kswiod may be blocked but kswapd itself will not be.  As long as the nbd
server does not try to do a synchronous swap, it won't deadlock on
kswiod.  In other words, it is safe to wait for availability of another
free page, but it is not safe to wait for completion of any single,
specific swap IO.  If kswapd itself no longer performs the IO, then we
can always free more memory, until we get to the complete death stage
where there are absolutely no clean pages left in the system.

If we do this, then both the inode and the superblock deadlocks
disappear.

--Stephen.
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Results: arcavm15, et. al.
  1999-01-10 21:51                                                     ` Steve Bergman
@ 1999-01-10 22:50                                                       ` Steve Bergman
  1999-01-11  0:20                                                         ` Steve Bergman
  1999-01-11 13:21                                                         ` Andrea Arcangeli
  0 siblings, 2 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-10 22:50 UTC (permalink / raw
  To: Andrea Arcangeli, Linus Torvalds, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

For the image load test:

pre6+zlatko's_patch             2:35
and with requested change       3:09
pre6                            2:27
pre5                            1:58
arcavm13                        9:13
arcavm15			1:59


For the kernel compile test:

In 12MB:
                                Elapsed Maj.    Min.    Swaps
                                -----   ------  ------  -----
pre6+zlatko_patch               22:14   383206  204482  57823
and with requested change       22:23   378662  198194  51445
pre6                            20:54   352934  191210  48678
pre5                            19:35   334680  183732  93427 
arcavm13                        19:45   344452  180243  38977
arcavm15			20:07	N/A	N/A	N/A

Arcavm15 looks very good.  pre5 and arcavm13 look a bit better but of the
kernels with the anti-deadlock code it looks the best so far. ( I assume that
being based upon pre6 it's safe.)
The battery in my palmtop died so I don't have the page fault and swaps results
available for arcavm15.  I'll grab the pre-7.gz patch and see how it does.

-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* testing/pre-7 and do_poll()
  1999-01-10 21:41                                                   ` Linus Torvalds
@ 1999-01-10 23:33                                                     ` Chip Salzenberg
  1999-01-11  6:02                                                       ` Linus Torvalds
  1999-01-11 20:20                                                       ` Adam Heath
  0 siblings, 2 replies; 243+ messages in thread
From: Chip Salzenberg @ 1999-01-10 23:33 UTC (permalink / raw
  To: Linus Torvalds; +Cc: linux-kernel, linux-mm

According to Linus Torvalds:
> There's a "pre-7.gz" on ftp.kernel.org in testing, anybody interested?

Got it, like it -- *except* the fix for overflow in do_poll() is a
little bit off.  Quoting testing/pre-7:

	if (timeout) {
		/* Carefula about overflow in the intermediate values */
		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
			timeout = (timeout*HZ+999)/1000+1;
		else /* Negative or overflow */
			timeout = MAX_SCHEDULE_TIMEOUT;
	}

However, the maximum legal millisecond timeout isn't (as shown)
MAX_SCHEDULE_TIMEOUT/HZ, but rather MAX_SCHEDULE_TIMEOUT/(1000/HZ).
So this code will turn some large timeouts into MAX_SCHEDULE_TIMEOUT
unnecessarily.

Therefore, I suggest this patch:

Index: fs/select.c
*************** asmlinkage int sys_poll(struct pollfd * 
*** 336,346 ****
  		goto out;
  
! 	if (timeout) {
! 		/* Carefula about overflow in the intermediate values */
! 		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
! 			timeout = (timeout*HZ+999)/1000+1;
! 		else /* Negative or overflow */
! 			timeout = MAX_SCHEDULE_TIMEOUT;
! 	}
  
  	err = -ENOMEM;
--- 336,343 ----
  		goto out;
  
! 	if (timeout < 0)
! 		timeout = MAX_SCHEDULE_TIMEOUT;
! 	else if (timeout)
! 		timeout = ROUND_UP(timeout, 1000/HZ);
  
  	err = -ENOMEM;


-- 
Chip Salzenberg      - a.k.a. -      <chip@perlsupport.com>
      "When do you work?"   "Whenever I'm not busy."
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: arcavm15, et. al.
  1999-01-10 22:50                                                       ` Results: arcavm15, et. al Steve Bergman
@ 1999-01-11  0:20                                                         ` Steve Bergman
  1999-01-11 13:21                                                         ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-11  0:20 UTC (permalink / raw
  To: Andrea Arcangeli, Linus Torvalds, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

Here are the results of pre-7:

116 Image test in 128MB:

pre6+zlatko's_patch             2:35
and with requested change       3:09
pre6                            2:27
pre5                            1:58
arcavm13                        9:13
arcavm15                        1:59
pre-7				2:41


For the kernel compile test in 12MB:

                                Elapsed Maj.    Min.    Swaps
                                -----   ------  ------  -----
pre6+zlatko_patch               22:14   383206  204482  57823
and with requested change       22:23   378662  198194  51445
pre6                            20:54   352934  191210  48678
pre5                            19:35   334680  183732  93427
arcavm13                        19:45   344452  180243  38977
arcavm15                        20:07   N/A     N/A     N/A
pre-7				21:14	356386	192835	50912


-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-09 22:39                                       ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
  1999-01-10  0:28                                         ` Steve Bergman
@ 1999-01-11  3:47                                         ` Gregory Maxwell
  1 sibling, 0 replies; 243+ messages in thread
From: Gregory Maxwell @ 1999-01-11  3:47 UTC (permalink / raw
  To: Steve Bergman
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Linus Torvalds, Alan Cox, Stephen C. Tweedie

On Sat, 9 Jan 1999, Steve Bergman wrote:

> pre6+zlatko_patch	5:29	192527	149728	3554
> pre6			5:27	192002	149694	4257
> pre5			5:28	188566	148674	5646
> arcavm13		5:32	188560	148234	1594

>From what I've seen, arcavm and the zlatko_patch do different things. Would
it be possible to do arcavm+zlatko? or am I wrong?

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-10 23:33                                                     ` testing/pre-7 and do_poll() Chip Salzenberg
@ 1999-01-11  6:02                                                       ` Linus Torvalds
  1999-01-11  6:26                                                         ` Chip Salzenberg
  1999-01-11 20:20                                                       ` Adam Heath
  1 sibling, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-11  6:02 UTC (permalink / raw
  To: Chip Salzenberg; +Cc: linux-kernel, linux-mm



On Sun, 10 Jan 1999, Chip Salzenberg wrote:
> 
> Got it, like it -- *except* the fix for overflow in do_poll() is a
> little bit off.  Quoting testing/pre-7:
> 
> 	if (timeout) {
> 		/* Carefula about overflow in the intermediate values */
> 		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
> 			timeout = (timeout*HZ+999)/1000+1;
> 		else /* Negative or overflow */
> 			timeout = MAX_SCHEDULE_TIMEOUT;
> 	}
> 
> However, the maximum legal millisecond timeout isn't (as shown)
> MAX_SCHEDULE_TIMEOUT/HZ, but rather MAX_SCHEDULE_TIMEOUT/(1000/HZ).
> So this code will turn some large timeouts into MAX_SCHEDULE_TIMEOUT
> unnecessarily.

Note the comment (and do NOT look at the speeling).

In particular, we need to make sure the _intermediate_ value doesn'
toverflow. We could do that by using 64-bit arithmetic, but let's not.

> ! 	if (timeout < 0)
> ! 		timeout = MAX_SCHEDULE_TIMEOUT;
> ! 	else if (timeout)
> ! 		timeout = ROUND_UP(timeout, 1000/HZ);

Eh? And re-introduce the original bug?

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 22:49                                                     ` Stephen C. Tweedie
@ 1999-01-11  6:04                                                       ` Eric W. Biederman
  1999-01-12 16:06                                                         ` Stephen C. Tweedie
  1999-01-11 11:20                                                       ` Pavel Machek
  1 sibling, 1 reply; 243+ messages in thread
From: Eric W. Biederman @ 1999-01-11  6:04 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Savochkin Andrey Vladimirovich, Andrea Arcangeli,
	steve, Eric W. Biederman, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	linux-kernel, Rik van Riel, linux-mm

>>>>> "ST" == Stephen C Tweedie <sct@redhat.com> writes:

ST> Hi,
ST> On Sun, 10 Jan 1999 10:35:10 -0800 (PST), Linus Torvalds
ST> <torvalds@transmeta.com> said:

>> On Sun, 10 Jan 1999, Stephen C. Tweedie wrote:
>>> 
>>> Ack.  I've been having a closer look, and making the superblock lock
>>> recursive doesn't work

>> That's fine - the superblock lock doesn't need to be re-entrant, because
>> __GFP_IO is quite sufficient for that one.

ST> I'm no longer convinced about that.  I think it's much much worse.  A
ST> bread() on an ext2 bitmap buffer with the superblock held is only safe
ST> if the IO can complete without _ever_ relying on a GFP_IO allocation.
ST> That means that any interrupt allocations required in that space have to
ST> be satisfiable by kswapd without GFP_IO, or kswapd could deadlock
ST> on us.

Well interrupts use GFP_ATOMIC . . . 

ST> It means that if our superblock-locked IO has to stall waiting for an
ST> nbd server process or a raid daemon, then those daemons cannot safely do
ST> GFP_IO.  It's really gross.

Right.  And the flag not to do I/O doesn't propagate across processes.
This sounds like a variation of the priority inheritance problem.

I wonder if this is why there are some known deadlocks with raid?

ST> I think it's actually ugly enough that we cannot make it safe: we can
ST> really only be sure if we prevent all GFP_IO from any process which
ST> might be involved in our deadlock loop, or if we avoid doing any IO with
ST> the superblock lock held.  

ST> In fact, to make it really safe we'd need to avoid synchronous swapout
ST> altogether: otherwise we can have

ST> Can we get away without synchronous swapout?  Notice that in this case,
ST> kswiod may be blocked but kswapd itself will not be.  As long as the nbd
ST> server does not try to do a synchronous swap, it won't deadlock on
ST> kswiod.  In other words, it is safe to wait for availability of another
ST> free page, but it is not safe to wait for completion of any single,
ST> specific swap IO.  If kswapd itself no longer performs the IO, then we
ST> can always free more memory, until we get to the complete death stage
ST> where there are absolutely no clean pages left in the system.

ST> If we do this, then both the inode and the superblock deadlocks
ST> disappear.

Sounds good.

I have a daemon just about ready to go, hopefully I can post it
tomorrow for preliminary testing.  It looks like my work for 2.3
in a small part can help deadlocks after all.

It walks the page tables and just writes out dirty pages, and marks
them clean but it doesn't remove them from processes.  So it can get
an early jump on writing things out.

Then if we are hitting a low memory situation (because pages become
dirty quickly), we can just wake it up, more often.

Currently we are doing totally asynchronous swapping but from the
context of the process that needs memory, (so the locks are in
different processes).  Adding a second daemon will play havoc on our
balancing but it shouldn't affect anything else. 

Grr.  I forgot about sysv shm.  It is the only thing doing synchronous
swapping right now.  

Oh, and just as a side note we are currently unfairly penalizing
threaded programs by doing for_each_task instead of for_each_mm in the
swapout code...

Eric
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11  6:02                                                       ` Linus Torvalds
@ 1999-01-11  6:26                                                         ` Chip Salzenberg
  1999-01-11  6:46                                                           ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Chip Salzenberg @ 1999-01-11  6:26 UTC (permalink / raw
  To: Linus Torvalds; +Cc: linux-kernel, linux-mm

According to Linus Torvalds:
> On Sun, 10 Jan 1999, Chip Salzenberg wrote:
> > However, the maximum legal millisecond timeout isn't (as shown)
> > MAX_SCHEDULE_TIMEOUT/HZ, but rather MAX_SCHEDULE_TIMEOUT/(1000/HZ).
> > So this code will turn some large timeouts into MAX_SCHEDULE_TIMEOUT
> > unnecessarily.
> 
> Note the comment (and do NOT look at the speeling).  In particular,
> we need to make sure the _intermediate_ value doesn' toverflow.

Of course; that's obvious.  What's perhaps less obvious is that I'm
suggesting a change in the calculation of timeout -- a change which
avoids the creation of unnecessarily large _intermediate_ values.

> > ! 	if (timeout < 0)
> > ! 		timeout = MAX_SCHEDULE_TIMEOUT;
> > ! 	else if (timeout)
> > ! 		timeout = ROUND_UP(timeout, 1000/HZ);
> 
> Eh? And re-introduce the original bug?

Well, I forgot the (unsigned long) cast, as someone else noted:

	timeout = ROUND_UP((unsigned long) timeout, 1000/HZ);

Otherwise, the code is Just Right.
-- 
Chip Salzenberg      - a.k.a. -      <chip@perlsupport.com>
      "When do you work?"   "Whenever I'm not busy."
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11  6:26                                                         ` Chip Salzenberg
@ 1999-01-11  6:46                                                           ` Linus Torvalds
  1999-01-11  6:59                                                             ` Chip Salzenberg
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-11  6:46 UTC (permalink / raw
  To: Chip Salzenberg; +Cc: linux-kernel, linux-mm



On Mon, 11 Jan 1999, Chip Salzenberg wrote:
> 
> Well, I forgot the (unsigned long) cast, as someone else noted:
> 
> 	timeout = ROUND_UP((unsigned long) timeout, 1000/HZ);
> 
> Otherwise, the code is Just Right.

Duh?

The above code is basically just completely wrong.

Hint: HZ is a define - not 100.

You just ended up dividing by zero on certain architectures.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11  6:46                                                           ` Linus Torvalds
@ 1999-01-11  6:59                                                             ` Chip Salzenberg
  1999-01-11  7:02                                                               ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Chip Salzenberg @ 1999-01-11  6:59 UTC (permalink / raw
  To: Linus Torvalds; +Cc: linux-kernel, linux-mm

According to Linus Torvalds:
> On Mon, 11 Jan 1999, Chip Salzenberg wrote:
> > Well, I forgot the (unsigned long) cast, as someone else noted:
> > 	timeout = ROUND_UP((unsigned long) timeout, 1000/HZ);
> > Otherwise, the code is Just Right.
> 
> Hint: HZ is a define - not 100.
> You just ended up dividing by zero on certain architectures.

I didn't think HZ ranged over 1000 in practice, else of course I would
not have written the above.
-- 
Chip Salzenberg      - a.k.a. -      <chip@perlsupport.com>
      "When do you work?"   "Whenever I'm not busy."
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11  6:59                                                             ` Chip Salzenberg
@ 1999-01-11  7:02                                                               ` Linus Torvalds
  1999-01-11 22:08                                                                 ` Shawn Leas
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-11  7:02 UTC (permalink / raw
  To: Chip Salzenberg; +Cc: linux-kernel, linux-mm



On Mon, 11 Jan 1999, Chip Salzenberg wrote:
> > 
> > Hint: HZ is a define - not 100.
> > You just ended up dividing by zero on certain architectures.
> 
> I didn't think HZ ranged over 1000 in practice, else of course I would
> not have written the above.

I made the same assumption wrt usecs (notice how I myself would divide by
zero on any architecture where HZ is over 1000000).

Right now, HZ is 100 on most architectures, with alpha being the exception
at 1024. Some of the PC speaker patches used to have HZ at 8192 even on a
PC, although later versions scaled it down (and just internally used a
timer tick happening at 8kHz, leaving HZ at 100).

With modern machines, 100Hz is just peanuts, and a HZ in the kilohertz
certainly makes sense - and allows for nicer granularity for a lot of
things. So far, megahertz are still far in the future, but maybe I some
day will have to remove even that assumption. Unlikely to be a problem in
my lifetime, but hey, I can hope (whether due to a long life or really
fast CPU's, I don't care ;) 

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Buffer handling (setting PG_referenced on access)
  1999-01-09  6:44                                               ` Linus Torvalds
  1999-01-09 18:58                                                 ` Andrea Arcangeli
@ 1999-01-11  9:21                                                 ` Zlatko Calusic
  1999-01-11 17:44                                                   ` Linus Torvalds
  1999-01-16 17:35                                                 ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Andrea Arcangeli
  2 siblings, 1 reply; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-11  9:21 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Linux-MM List, Linux Kernel List, Dax Kelson, Steve Bergman,
	Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Ben McCann, bredelin, Alan Cox, Stephen C. Tweedie

Linus Torvalds <torvalds@transmeta.com> writes:

> Btw, if there are people there who actually like timing different things
> (something I _hate_ doing - I lose interest if things become just a matter
> of numbers rather than trying to get some algorithm right), then there's
> one thing I'd love to hear about: the effect of trying to do some
> access bit setting on buffer cache pages.

OK, implementation was easy and simple, much simpler than it was made
before (with BH_Touched copying...), but I must admit that even after
lots of testing I couldn't find any difference. Not in performance,
not in CPU usage, not in overall behaviour. Whatever results I have
accomplished, they were too much in the statistical noise, so I don't
have any useful data. Maybe, others can try and see.

But, nevertheless, four lines added to the kernel look very correct to
me. My vote for including, if for nothing, then to make balance with
page cache. It won't harm anything, that's for sure. Patch applies
cleanly on pre-7, as found in testing directory on ftp.kernel.org.


Index: 2206.7/include/linux/fs.h
--- 2206.7/include/linux/fs.h Mon, 04 Jan 1999 17:24:06 +0100 zcalusic (linux-2.1/z/b/10_fs.h 1.1.5.1.1.3 644)
+++ 2206.8(w)/include/linux/fs.h Mon, 11 Jan 1999 08:31:48 +0100 zcalusic (linux-2.1/z/b/10_fs.h 1.1.5.1.1.3.1.1 644)
@@ -178,6 +178,9 @@
 #define BH_Req		3	/* 0 if the buffer has been invalidated */
 #define BH_Protected	6	/* 1 if the buffer is protected */
 
+#define buffer_page(bh)		(mem_map + MAP_NR((bh)->b_data))
+#define touch_buffer(bh)	set_bit(PG_referenced, &buffer_page(bh)->flags)
+
 /*
  * Try to keep the most commonly used fields in single cache lines (16
  * bytes) to improve performance.  This ordering should be
@@ -250,19 +253,6 @@
 {
 	return test_bit(BH_Protected, &bh->b_state);
 }
-
-/*
- * Deprecated - we don't keep per-buffer reference flags
- * any more.
- *
- * We _could_ try to update the page reference, but that
- * doesn't seem to really be worth it either. If we did,
- * it would look something like this:
- *
- *	#define buffer_page(bh)		(mem_map + MAP_NR((bh)->b_data))
- *	#define touch_buffer(bh)	set_bit(PG_referenced, &buffer_page(bh)->flags)
- */
-#define touch_buffer(bh)	do { } while (0)
 
 #include <linux/pipe_fs_i.h>
 #include <linux/minix_fs_i.h>
Index: 2206.7/fs/buffer.c
--- 2206.7/fs/buffer.c Sat, 09 Jan 1999 03:44:23 +0100 zcalusic (linux-2.1/G/b/41_buffer.c 1.1.1.1.1.3.2.1.2.1 644)
+++ 2206.8(w)/fs/buffer.c Mon, 11 Jan 1999 08:31:48 +0100 zcalusic (linux-2.1/G/b/41_buffer.c 1.1.1.1.1.3.2.1.2.1.1.1 644)
@@ -737,6 +737,7 @@
 				 put_last_lru(bh);
 			bh->b_flushtime = 0;
 		}
+		touch_buffer(bh);
 		return bh;
 	}
 
@@ -754,6 +755,7 @@
 	bh->b_lru_time	= jiffies;
 	bh->b_state=0;
 	insert_into_queues(bh);
+	touch_buffer(bh);
 	return bh;
 
 	/*

Regards,
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 22:49                                                     ` Stephen C. Tweedie
  1999-01-11  6:04                                                       ` Eric W. Biederman
@ 1999-01-11 11:20                                                       ` Pavel Machek
  1999-01-11 17:35                                                         ` Stephen C. Tweedie
  1 sibling, 1 reply; 243+ messages in thread
From: Pavel Machek @ 1999-01-11 11:20 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Savochkin Andrey Vladimirovich, Andrea Arcangeli,
	steve, Eric W. Biederman, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	linux-kernel, Rik van Riel, linux-mm

Hi!

> In fact, to make it really safe we'd need to avoid synchronous swapout
> altogether: otherwise we can have
> 
> 	    A			kswiod		nbd server process
> 	    lock_super();
> 	    bread(ndb device);
> 	    try_to_free_page();
> 	    rw_swap_page_async();
> 				filemap_write_page();
> 				lock_super();
> 	    wait_on_buffer();
> 						try_to_free_page();
> 						rw_swap_page_sync();
> 						Oops, kswiod is stalled.
> 
> Can we get away without synchronous swapout?  Notice that in this case,
> kswiod may be blocked but kswapd itself will not be.  As long as the nbd
> server does not try to do a synchronous swap, it won't deadlock on
> kswiod.  In other words, it is safe to wait for availability of
> another

Is this only matter of nbd? If so, maybe the best solution is to start
claiming: "don't swap over nbd, don't mount localhost drives read
write". [It is bad, but it is probably better than polluting rest of
kernel with nbd workarounds...]

								Pavel
-- 
The best software in life is free (not shareware)!		Pavel
GCM d? s-: !g p?:+ au- a--@ w+ v- C++@ UL+++ L++ N++ E++ W--- M- Y- R+
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: arcavm15, et. al.
  1999-01-10 22:50                                                       ` Results: arcavm15, et. al Steve Bergman
  1999-01-11  0:20                                                         ` Steve Bergman
@ 1999-01-11 13:21                                                         ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-11 13:21 UTC (permalink / raw
  To: Steve Bergman
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Sun, 10 Jan 1999, Steve Bergman wrote:

> For the image load test:
> 
> pre6+zlatko's_patch             2:35
> arcavm15			1:59

Steve, could you give a try to this new arca-vm-16?

arca-vm-16 is still against 2.2.0-pre6 because last night I can't find
pre7 on the ftp site.

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.11 linux/mm/vmscan.c:1.1.1.1.2.86
--- linux/mm/vmscan.c:1.1.1.11	Sat Jan  9 12:58:26 1999
+++ linux/mm/vmscan.c	Mon Jan 11 14:14:13 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache() and always async swapout original idea.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,6 +25,8 @@
 
 #include <asm/pgtable.h>
 
+int swapout_interval = HZ;
+
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -306,7 +313,8 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int counter, assign;
+	unsigned long max_cnt;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -325,7 +333,7 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
+	else if (counter > nr_tasks)
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
@@ -338,7 +346,7 @@
 		for (; p != &init_task; p = p->next_task) {
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -430,7 +438,7 @@
 			break;
 		current->state = TASK_INTERRUPTIBLE;
 		run_task_queue(&tq_disk);
-		schedule_timeout(HZ);
+		schedule_timeout(swapout_interval);
 
 		/*
 		 * kswapd isn't even meant to keep up with anything,
@@ -438,13 +446,36 @@
 		 * point is to make sure that the system doesn't stay
 		 * forever in a really bad memory squeeze.
 		 */
-		if (nr_free_pages < freepages.high)
-			try_to_free_pages(0, 16);
+		if (nr_free_pages < freepages.min)
+			try_to_free_pages(0, freepages.high - nr_free_pages);
 	}
 
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	static unsigned long grow_swap_cache = 0;
+
+	if (!shrink_mmap(priority, gfp_mask))
+		grow_swap_cache = 1;
+
+	switch (grow_swap_cache)
+	{
+	case 0:
+		return 1;
+	default:
+		if (grow_swap_cache++ >= freepages.high)
+			grow_swap_cache = 0;
+	}
+
+	if (swap_out(priority, gfp_mask))
+		return 1;
+
+	grow_swap_cache = 0;
+	return 0;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,33 +488,32 @@
 int try_to_free_pages(unsigned int gfp_mask, int count)
 {
 	int priority;
+	static int state = 0;
 
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
 	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			kmem_cache_reap(gfp_mask);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 


Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-10 18:35                                                   ` Linus Torvalds
                                                                       ` (2 preceding siblings ...)
  1999-01-10 22:49                                                     ` Stephen C. Tweedie
@ 1999-01-11 14:11                                                     ` Savochkin Andrey Vladimirovich
  1999-01-11 17:55                                                       ` Linus Torvalds
  3 siblings, 1 reply; 243+ messages in thread
From: Savochkin Andrey Vladimirovich @ 1999-01-11 14:11 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, steve, Stephen C. Tweedie, Eric W. Biederman,
	brent verner, Garst R. Reese, Kalle Andersson, Zlatko Calusic,
	Ben McCann, Alan Cox, bredelin, linux-kernel, Rik van Riel,
	linux-mm

On Sun, Jan 10, 1999 at 10:35:10AM -0800, Linus Torvalds wrote:
> The thing I want to make re-entrant is just semaphore accesses: at the
> point where we would otherwise deadlock on the writer semaphore it's much
> better to just allow nested writes. I suspect all filesystems can already
> handle nested writes - they are a lot easier to handle than truly
> concurrent ones.

You're an optimist, aren't you? :-)

In any case I've checked your recursive semaphore code on a news server
which reliably deadlocked with the previous kernels.
The code seems to work well.

Best wishes
					Andrey V.
					Savochkin
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-10 20:29                                                 ` Steve Bergman
  1999-01-10 21:41                                                   ` Linus Torvalds
@ 1999-01-11 16:57                                                   ` Steve Bergman
  1999-01-11 19:36                                                     ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Steve Bergman @ 1999-01-11 16:57 UTC (permalink / raw
  To: Linus Torvalds, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

Here are updated results including arcavm16:


116 Image test in 128MB:

pre6+zlatko's_patch             2:35
and with requested change       3:09
pre6                            2:27
pre5                            1:58
arcavm13                        9:13
arcavm15                        1:59
pre-7                           2:41
arcavm16			1:54

For the kernel compile test in 12MB:

                                Elapsed Maj.    Min.    Swaps
                                -----   ------  ------  -----
pre6+zlatko_patch               22:14   383206  204482  57823
and with requested change       22:23   378662  198194  51445
pre6                            20:54   352934  191210  48678
pre5                            19:35   334680  183732  93427
arcavm13                        19:45   344452  180243  38977
arcavm15                        20:07   N/A     N/A     N/A
pre-7                           21:14   356386  192835  50912
arcavm16			20:09	N/A	N/A	N/A


I think it's better than arcavm15 on the image test and the same on the compile
test.


-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-11 11:20                                                       ` Pavel Machek
@ 1999-01-11 17:35                                                         ` Stephen C. Tweedie
  0 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-11 17:35 UTC (permalink / raw
  To: Pavel Machek, Linus Torvalds
  Cc: Stephen C. Tweedie, Savochkin Andrey Vladimirovich,
	Andrea Arcangeli, steve, Eric W. Biederman, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	Alan Cox, bredelin, linux-kernel, Rik van Riel, linux-mm

Hi,

On Mon, 11 Jan 1999 12:20:39 +0100, Pavel Machek
<pavel@atrey.karlin.mff.cuni.cz> said:

> Hi!
>> In fact, to make it really safe we'd need to avoid synchronous swapout
>> altogether: otherwise we can have
>> 
>> A			kswiod		nbd server process
>> [deadlock]

> Is this only matter of nbd? If so, maybe the best solution is to start
> claiming: "don't swap over nbd, don't mount localhost drives read
> write". [It is bad, but it is probably better than polluting rest of
> kernel with nbd workarounds...]

No.  Any other process which gets in the way of our IO and which blocks
for memory allocation can cause the deadlock.  That might be another
process doing a file IO, locking a buffer and then allocating memory
inside the scsi layers, for example.  It is not limited to nbd, but
nbd's networking use will probably make it particularly bad.

Linus, I've also realised that making semaphores recursive does not fix
the inode deadlock.  It only eliminates the single process case.  We can
still have two separate processes each writing to a separate mmaped()
file deadlock.  If each process starts a msync() on its own file and in
the process of that tries to sync one of the other process's pages via
try_to_free_page, we get the deadlock back.

I can't see any way around this other than to make try_to_free never,
ever block on IO, which implies having a separate page writer thread, or
to rework the code so that we never allocate memory while holding one of
the critical filesystem locks (which is a non-starter for 2.2).

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Buffer handling (setting PG_referenced on access)
  1999-01-11  9:21                                                 ` Buffer handling (setting PG_referenced on access) Zlatko Calusic
@ 1999-01-11 17:44                                                   ` Linus Torvalds
  1999-01-11 20:14                                                     ` Zlatko Calusic
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-11 17:44 UTC (permalink / raw
  To: Zlatko Calusic
  Cc: Linux-MM List, Linux Kernel List, Dax Kelson, Steve Bergman,
	Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Ben McCann, bredelin, Alan Cox, Stephen C. Tweedie



On 11 Jan 1999, Zlatko Calusic wrote:
> 
> OK, implementation was easy and simple, much simpler than it was made
> before (with BH_Touched copying...), but I must admit that even after
> lots of testing I couldn't find any difference. Not in performance,
> not in CPU usage, not in overall behaviour. Whatever results I have
> accomplished, they were too much in the statistical noise, so I don't
> have any useful data. Maybe, others can try and see.

This was what I saw in my very very inconclusive tests too - which is why
I decided that there was no point in doing buffer cache aging at all.

> But, nevertheless, four lines added to the kernel look very correct to
> me. My vote for including, if for nothing, then to make balance with
> page cache. It won't harm anything, that's for sure.

I can easily see it harming something - I actually think that not using
the reference bit is "safer" in that it never allows the buffer cache to
grow very aggressively for very long (and we definitely don't want to have
an overlarge buffer cache - it's mostly used for temporary buffers for
write-out anyway).

Basically I don't want to enable the aging code unless somebody shows me
that it makes a marked improvement under some (reasonably real-world)
circumstances.. So far the jury seems to say that it doesn't.

		Linus


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-11 14:11                                                     ` Savochkin Andrey Vladimirovich
@ 1999-01-11 17:55                                                       ` Linus Torvalds
  1999-01-11 18:37                                                         ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-11 17:55 UTC (permalink / raw
  To: Savochkin Andrey Vladimirovich
  Cc: Andrea Arcangeli, steve, Stephen C. Tweedie, Eric W. Biederman,
	brent verner, Garst R. Reese, Kalle Andersson, Zlatko Calusic,
	Ben McCann, Alan Cox, bredelin, linux-kernel, Rik van Riel,
	linux-mm



On Mon, 11 Jan 1999, Savochkin Andrey Vladimirovich wrote:
> On Sun, Jan 10, 1999 at 10:35:10AM -0800, Linus Torvalds wrote:
> > The thing I want to make re-entrant is just semaphore accesses: at the
> > point where we would otherwise deadlock on the writer semaphore it's much
> > better to just allow nested writes. I suspect all filesystems can already
> > handle nested writes - they are a lot easier to handle than truly
> > concurrent ones.
> 
> You're an optimist, aren't you? :-)

No, drugged to my eye-brows.

> In any case I've checked your recursive semaphore code on a news server
> which reliably deadlocked with the previous kernels.
> The code seems to work well.

I found a rather nasty race in my implementation - it's basically
impossible to trigger in real life, but quite frankly I don't want to have
semaphores that have a really subtle bug in them. 

However much I tried, I couldn't make the race go away without using a
spinlock in the critical path of the semaphore, something which I very
much want to avoid.

Unless I find a good recursive semaphore implementation (and I'm starting
to despair about finding one that is lock-free for the non-contention
case), I'll have to come up with something else (like letting only kswapd
swap out pages as has been discussed here).

			Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-11 17:55                                                       ` Linus Torvalds
@ 1999-01-11 18:37                                                         ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-11 18:37 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Savochkin Andrey Vladimirovich, steve, Stephen C. Tweedie,
	Eric W. Biederman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, Alan Cox, bredelin, linux-kernel,
	Rik van Riel, linux-mm

On Mon, 11 Jan 1999, Linus Torvalds wrote:

> Unless I find a good recursive semaphore implementation (and I'm starting

I would like to think about it for some days but I haven't time due to
OFFTOPIC-AND-BORING University studies argghhh... I'll try to find the
time though (hoping to sleep enough to survive ;)...

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-11 16:57                                                   ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
@ 1999-01-11 19:36                                                     ` Andrea Arcangeli
  1999-01-11 23:03                                                       ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-11 19:36 UTC (permalink / raw
  To: Steve Bergman
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Mon, 11 Jan 1999, Steve Bergman wrote:

> Here are updated results including arcavm16:
> 
> 
> 116 Image test in 128MB:
> 
> pre6+zlatko's_patch             2:35
> and with requested change       3:09
> pre6                            2:27
> pre5                            1:58
> arcavm13                        9:13
> arcavm15                        1:59
> pre-7                           2:41
> arcavm16			1:54

Cool, now that arcavm16 (in pre6) is faster than pre5 I am curious to see
what will happen with the one-liner patch below applied on top of
arcavm16 (maybe nothing but... ;). We can call the resulting code arcavm17
against pre6 (which practically instead is arcavm16 applied on pre5 ;).

Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.7 linux/mm/page_alloc.c:1.1.1.1.2.27
--- linux/mm/page_alloc.c:1.1.1.7	Sat Jan  9 12:58:25 1999
+++ linux/mm/page_alloc.c	Mon Jan 11 19:57:07 1999
@@ -279,7 +279,7 @@
 		{
 			int freed;
 			current->flags |= PF_MEMALLOC;
-			freed = try_to_free_pages(gfp_mask, freepages.high - nr_free_pages);
+			freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
 			current->flags &= ~PF_MEMALLOC;
 			if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 				goto nopage;



Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Buffer handling (setting PG_referenced on access)
  1999-01-11 17:44                                                   ` Linus Torvalds
@ 1999-01-11 20:14                                                     ` Zlatko Calusic
  0 siblings, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-11 20:14 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Linux-MM List, Linux Kernel List, Dax Kelson, Steve Bergman,
	Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Ben McCann, bredelin, Alan Cox, Stephen C. Tweedie

Linus Torvalds <torvalds@transmeta.com> writes:

> On 11 Jan 1999, Zlatko Calusic wrote:
> > 
> > OK, implementation was easy and simple, much simpler than it was made
> > before (with BH_Touched copying...), but I must admit that even after
> > lots of testing I couldn't find any difference. Not in performance,
> > not in CPU usage, not in overall behaviour. Whatever results I have
> > accomplished, they were too much in the statistical noise, so I don't
> > have any useful data. Maybe, others can try and see.
> 
> This was what I saw in my very very inconclusive tests too - which is why
> I decided that there was no point in doing buffer cache aging at all.

Yes, looks like we finished our tests with same results.

> 
> > But, nevertheless, four lines added to the kernel look very correct to
> > me. My vote for including, if for nothing, then to make balance with
> > page cache. It won't harm anything, that's for sure.
> 
> I can easily see it harming something - I actually think that not using
> the reference bit is "safer" in that it never allows the buffer cache to
> grow very aggressively for very long (and we definitely don't want to have
> an overlarge buffer cache - it's mostly used for temporary buffers for
> write-out anyway).
> 
> Basically I don't want to enable the aging code unless somebody shows me
> that it makes a marked improvement under some (reasonably real-world)
> circumstances.. So far the jury seems to say that it doesn't.
> 

OK, I got one more idea in the meantime, and I'll try it as the time
permits. In the meantime, I agree with you. If we can't prove it's
actually worthwhile to add those four lines, then we really don't need
them.

Regards,
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-10 23:33                                                     ` testing/pre-7 and do_poll() Chip Salzenberg
  1999-01-11  6:02                                                       ` Linus Torvalds
@ 1999-01-11 20:20                                                       ` Adam Heath
  1 sibling, 0 replies; 243+ messages in thread
From: Adam Heath @ 1999-01-11 20:20 UTC (permalink / raw
  To: Chip Salzenberg; +Cc: Linus Torvalds, linux-kernel, linux-mm

On Sun, 10 Jan 1999, Chip Salzenberg wrote:

> According to Linus Torvalds:
> > There's a "pre-7.gz" on ftp.kernel.org in testing, anybody interested?
> 
> Got it, like it -- *except* the fix for overflow in do_poll() is a
> little bit off.  Quoting testing/pre-7:
> 
> 	if (timeout) {
> 		/* Carefula about overflow in the intermediate values */
> 		if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
> 			timeout = (timeout*HZ+999)/1000+1;
> 		else /* Negative or overflow */
> 			timeout = MAX_SCHEDULE_TIMEOUT;
> 	}
> 
> However, the maximum legal millisecond timeout isn't (as shown)
> MAX_SCHEDULE_TIMEOUT/HZ, but rather MAX_SCHEDULE_TIMEOUT/(1000/HZ).
> So this code will turn some large timeouts into MAX_SCHEDULE_TIMEOUT
> unnecessarily.

A/(B/C) = A * (C / B) = A / B * C (done this way to eliminate overflow)

MAX_SCHEDULE_TIMEOUT / 1000 * HZ

Adam


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11  7:02                                                               ` Linus Torvalds
@ 1999-01-11 22:08                                                                 ` Shawn Leas
  1999-01-11 22:13                                                                   ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Shawn Leas @ 1999-01-11 22:08 UTC (permalink / raw
  To: Linus Torvalds; +Cc: Chip Salzenberg, linux-kernel, linux-mm

On Sun, 10 Jan 1999, Linus Torvalds wrote:

> things. So far, megahertz are still far in the future, but maybe I some
> day will have to remove even that assumption. Unlikely to be a problem in
> my lifetime, but hey, I can hope (whether due to a long life or really
> fast CPU's, I don't care ;) 

Well, they've made a photonic chip, so we may be thinking about this
sooner than you think... Think 200GHz processors.

-Shawn
<=========== America Held Hostage ===========>
   Day 2182 for the poor and the middle class. 
   Day 2201 for the rich and the dead.
   740 days remaining in the Raw Deal.
<============================================> 

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11 22:08                                                                 ` Shawn Leas
@ 1999-01-11 22:13                                                                   ` Linus Torvalds
  1999-01-12  0:25                                                                     ` estafford
  1999-01-12  7:06                                                                     ` Gregory Maxwell
  0 siblings, 2 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-11 22:13 UTC (permalink / raw
  To: Shawn Leas; +Cc: Chip Salzenberg, linux-kernel, linux-mm



On Mon, 11 Jan 1999, Shawn Leas wrote:

> On Sun, 10 Jan 1999, Linus Torvalds wrote:
> 
> > things. So far, megahertz are still far in the future, but maybe I some
> > day will have to remove even that assumption. Unlikely to be a problem in
> > my lifetime, but hey, I can hope (whether due to a long life or really
> > fast CPU's, I don't care ;) 
> 
> Well, they've made a photonic chip, so we may be thinking about this
> sooner than you think... Think 200GHz processors.

Hey, I want to see the memory subsystems for it..

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-11 19:36                                                     ` Andrea Arcangeli
@ 1999-01-11 23:03                                                       ` Andrea Arcangeli
  1999-01-11 23:38                                                         ` Zlatko Calusic
  1999-01-12  2:02                                                         ` Steve Bergman
  0 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-11 23:03 UTC (permalink / raw
  To: Steve Bergman
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

I've seen pre7 now and I produced arca-vm-18 against it.

In arca-vm-18 I avoided the swapping readahead if we would be forced to do
_sync_ IO in the readahead. This makes tons of sense to me. 

I also reverted my trashing heuristic to a per-process thing. The point of
the heuristic is not to penalize processes. The point is to _not_ penalize
processes that are not eating memory. Making it a static variable makes
sense of course but it's a completely different thing. And I think that
having only a few processes that are in the free-pages-path will improve
global performance. 

Here arca-vm-18 against 2.2.0-pre7 in the testing directory (sent me by
email by Steve).

Note: it's still very interesting how arca-vm-17 is performing since here
I am following pre5/pre7/arca-vm-17 style of freeing every time only
SWAP_CLUSTER_MAX pages. Here this change decrease a _lot_ swapout
performances, but I don't know if the global system is faster... I am only
running a trashing-swapout-benchmarking application, and I am not
benchmarking how the rest of the system is responsive...

Another thing that would be interesting could be to change
SWAPFILE_CLUSTER to 256 as in clean pre7. I think it's not needed because
I am not hearing disk seeks under heavy swapping, but I guess there is
some reason it is 256 in pre7 ;)?

Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.6 linux/include/linux/mm.h:1.1.1.1.2.18
--- linux/include/linux/mm.h:1.1.1.6	Mon Jan 11 22:23:57 1999
+++ linux/include/linux/mm.h	Mon Jan 11 22:56:08 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -302,8 +301,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
Index: linux/include/linux/sched.h
diff -u linux/include/linux/sched.h:1.1.1.6 linux/include/linux/sched.h:1.1.1.1.2.11
--- linux/include/linux/sched.h:1.1.1.6	Mon Jan 11 22:24:03 1999
+++ linux/include/linux/sched.h	Mon Jan 11 23:29:36 1999
@@ -270,6 +275,7 @@
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
+	int trashing:1;
 	unsigned long swap_address;
 	unsigned long swap_cnt;		/* number of pages to swap on next pass */
 /* process credentials */
@@ -355,7 +361,7 @@
 /* utime */	{0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */	0,0,0,0,0,0, \
-/* swp */	0,0,0, \
+/* swp */	0,0,0,0, \
 /* process credentials */					\
 /* uid etc */	0,0,0,0,0,0,0,0,				\
 /* suppl grps*/ 0, {0,},					\
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.6 linux/kernel/fork.c:1.1.1.1.2.10
--- linux/kernel/fork.c:1.1.1.6	Mon Jan 11 22:24:21 1999
+++ linux/kernel/fork.c	Mon Jan 11 22:56:09 1999
@@ -511,6 +514,7 @@
 
 	p->did_exec = 0;
 	p->swappable = 0;
+	p->trashing = 0;
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
Index: linux/kernel/sysctl.c
diff -u linux/kernel/sysctl.c:1.1.1.6 linux/kernel/sysctl.c:1.1.1.1.2.11
--- linux/kernel/sysctl.c:1.1.1.6	Mon Jan 11 22:24:22 1999
+++ linux/kernel/sysctl.c	Mon Jan 11 22:56:09 1999
@@ -32,7 +32,7 @@
 
 /* External variables not in a header file. */
 extern int panic_timeout;
-extern int console_loglevel, C_A_D;
+extern int console_loglevel, C_A_D, swapout_interval;
 extern int bdf_prm[], bdflush_min[], bdflush_max[];
 extern char binfmt_java_interpreter[], binfmt_java_appletviewer[];
 extern int sysctl_overcommit_memory;
@@ -216,6 +216,8 @@
 };
 
 static ctl_table vm_table[] = {
+	{VM_SWAPOUT, "swapout_interval",
+	 &swapout_interval, sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_FREEPG, "freepages", 
 	 &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec},
 	{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0600, NULL,
Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.9 linux/mm/filemap.c:1.1.1.1.2.44
--- linux/mm/filemap.c:1.1.1.9	Thu Jan  7 12:21:35 1999
+++ linux/mm/filemap.c	Sat Jan  9 19:30:01 1999
@@ -122,13 +126,13 @@
 {
 	static unsigned long clock = 0;
 	unsigned long limit = num_physpages;
+	unsigned long count;
 	struct page * page;
-	int count;
 
 	count = (limit << 1) >> priority;
 
 	page = mem_map + clock;
-	do {
+	while (count-- != 0) {
 		int referenced;
 
 		/* This works even in the presence of PageSkip because
@@ -147,7 +151,6 @@
 			clock = page->map_nr;
 		}
 		
-		count--;
 		referenced = test_and_clear_bit(PG_referenced, &page->flags);
 
 		if (PageLocked(page))
@@ -191,8 +194,7 @@
 			remove_inode_page(page);
 			return 1;
 		}
-
-	} while (count > 0);
+	}
 	return 0;
 }
 
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.8 linux/mm/page_alloc.c:1.1.1.1.2.28
--- linux/mm/page_alloc.c:1.1.1.8	Mon Jan 11 22:24:23 1999
+++ linux/mm/page_alloc.c	Mon Jan 11 22:56:09 1999
@@ -212,19 +212,18 @@
 		 * further thought.
 		 */
 		if (!(current->flags & PF_MEMALLOC)) {
-			static int trashing = 0;
 			int freed;
 
 			if (nr_free_pages > freepages.min) {
-				if (!trashing)
+				if (!current->trashing)
 					goto ok_to_allocate;
 				if (nr_free_pages > freepages.low) {
-					trashing = 0;
+					current->trashing = 0;
 					goto ok_to_allocate;
 				}
 			}
 
-			trashing = 1;
+			current->trashing = 1;
 			current->flags |= PF_MEMALLOC;
 			freed = try_to_free_pages(gfp_mask);
 			current->flags &= ~PF_MEMALLOC;
@@ -356,7 +355,9 @@
 	offset = (offset >> page_cluster) << page_cluster;
 	
 	for (i = 1 << page_cluster; i > 0; i--) {
-	      if (offset >= swapdev->max)
+	      if (offset >= swapdev->max ||
+		  /* don't block on I/O for doing readahead -arca */
+		  atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
 		      return;
 	      if (!swapdev->swap_map[offset] ||
 		  swapdev->swap_map[offset] == SWAP_MAP_BAD ||
Index: linux/mm/swapfile.c
diff -u linux/mm/swapfile.c:1.1.1.3 linux/mm/swapfile.c:1.1.1.1.2.4
--- linux/mm/swapfile.c:1.1.1.3	Mon Jan 11 22:24:24 1999
+++ linux/mm/swapfile.c	Mon Jan 11 22:56:09 1999
@@ -23,7 +23,7 @@
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-#define SWAPFILE_CLUSTER 256
+#define SWAPFILE_CLUSTER	SWAP_CLUSTER_MAX
 
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.12 linux/mm/vmscan.c:1.1.1.1.2.87
--- linux/mm/vmscan.c:1.1.1.12	Mon Jan 11 22:24:24 1999
+++ linux/mm/vmscan.c	Mon Jan 11 22:56:09 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache() and always async swapout original idea.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,6 +25,8 @@
 
 #include <asm/pgtable.h>
 
+int swapout_interval = HZ;
+
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -306,7 +313,8 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int counter, assign;
+	unsigned long max_cnt;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -325,7 +333,7 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
+	else if (counter > nr_tasks)
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
@@ -338,7 +346,7 @@
 		for (; p != &init_task; p = p->next_task) {
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
@@ -430,7 +438,7 @@
 			break;
 		current->state = TASK_INTERRUPTIBLE;
 		run_task_queue(&tq_disk);
-		schedule_timeout(HZ);
+		schedule_timeout(swapout_interval);
 
 		/*
 		 * kswapd isn't even meant to keep up with anything,
@@ -445,6 +453,29 @@
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	static unsigned long grow_swap_cache = 0;
+
+	if (!shrink_mmap(priority, gfp_mask))
+		grow_swap_cache = 1;
+
+	switch (grow_swap_cache)
+	{
+	case 0:
+		return 1;
+	default:
+		if (grow_swap_cache++ >= freepages.high)
+			grow_swap_cache = 0;
+	}
+
+	if (swap_out(priority, gfp_mask))
+		return 1;
+
+	grow_swap_cache = 0;
+	return 0;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,34 +488,33 @@
 int try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
+	static int state = 0;
 	int count = SWAP_CLUSTER_MAX;
 
 	lock_kernel();
 
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
 	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			kmem_cache_reap(gfp_mask);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-11 23:03                                                       ` Andrea Arcangeli
@ 1999-01-11 23:38                                                         ` Zlatko Calusic
  1999-01-12  2:02                                                         ` Steve Bergman
  1 sibling, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-11 23:38 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Steve Bergman, Linus Torvalds, brent verner, Garst R. Reese,
	Kalle Andersson, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

Andrea Arcangeli <andrea@e-mind.com> writes:

> I've seen pre7 now and I produced arca-vm-18 against it.
> 
> In arca-vm-18 I avoided the swaping readahead if we would be forced to do
> _sync_ IO in the readahead. This make tons of sense to me. 

Yes, I agree. I made the same change in my development patches this
morning, and it works well. We can only gain with a policy like that.

> Another thing that would be interesting could be to change
> SWAPFILE_CLUSTER to 256 as in clean pre7. I think it's not needed because
> I am not hearing disk seeks under heavy swapping but may I guess there is
> some reason is 256 in pre7 ;)?
> 

Better clustering of pages on the disk. That will improve swapin
readahead hit rate, and overall performance.

Regards,
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11 22:13                                                                   ` Linus Torvalds
@ 1999-01-12  0:25                                                                     ` estafford
  1999-01-12  8:25                                                                       ` Shawn Leas
  1999-01-12  7:06                                                                     ` Gregory Maxwell
  1 sibling, 1 reply; 243+ messages in thread
From: estafford @ 1999-01-12  0:25 UTC (permalink / raw
  To: Linus Torvalds; +Cc: linux-mm, linux-kernel, Chip Salzenberg, Shawn Leas


On 11-Jan-99 Linus Torvalds wrote:
> 
> 
> On Mon, 11 Jan 1999, Shawn Leas wrote:
> 
>> On Sun, 10 Jan 1999, Linus Torvalds wrote:
>> 
>> > things. So far, megahertz are still far in the future, but maybe I some
>> > day will have to remove even that assumption. Unlikely to be a problem in
>> > my lifetime, but hey, I can hope (whether due to a long life or really
>> > fast CPU's, I don't care ;) 
>> 
>> Well, they've made a photonic chip, so we may be thinking about this
>> sooner than you think... Think 200GHz processors.
> 
> Hey, I want to see the memory subsystems for it..
> 
>               Linus

Got any links to such technology?  I've been working on it for a while, but
making transistors from glass is not quite as easy as I would think.. Lemme
know.

----------------------------------------
Ed Stafford            Of. (901)348-3487
iXL Hosting            Fx. (901)345-9992      
Programming Engineer   estafford@ixl.com
----------------------------------------

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: pre6 vs pre6+zlatko's_patch  vs pre5 vs arcavm13
  1999-01-11 23:03                                                       ` Andrea Arcangeli
  1999-01-11 23:38                                                         ` Zlatko Calusic
@ 1999-01-12  2:02                                                         ` Steve Bergman
  1999-01-12  3:21                                                           ` Results: Zlatko's new vm patch Steve Bergman
  1 sibling, 1 reply; 243+ messages in thread
From: Steve Bergman @ 1999-01-12  2:02 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

Andrea Arcangeli wrote:
> 

> Here arca-vm-18 against 2.2.0-pre7 in the testing directory (sent me by
> email by Steve).
> 

(Zlatko, your patch is next. ;-)  )

Here are the results:


116 Image test in 128MB:

pre6+zlatko's_patch             2:35
and with requested change       3:09
pre6                            2:27
pre5                            1:58
arcavm13                        9:13
arcavm15                        1:59
pre-7                           2:41
arcavm16                        1:54
arcavm18			1:57

For the kernel compile test in 12MB:

                                Elapsed Maj.    Min.    Swaps
                                -----   ------  ------  -----
pre6+zlatko_patch               22:14   383206  204482  57823
and with requested change       22:23   378662  198194  51445
pre6                            20:54   352934  191210  48678
pre5                            19:35   334680  183732  93427
arcavm13                        19:45   344452  180243  38977
arcavm15                        20:07   N/A     N/A     N/A
pre-7                           21:14   356386  192835  50912
arcavm16                        20:09   N/A     N/A     N/A
arcavm18			21:08	363438	190763	48982


-Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Results: Zlatko's new vm patch
  1999-01-12  2:02                                                         ` Steve Bergman
@ 1999-01-12  3:21                                                           ` Steve Bergman
  1999-01-12  5:33                                                             ` Linus Torvalds
  1999-01-13 20:47                                                             ` [patch] arca-vm-19 [Re: Results: Zlatko's new vm patch] Andrea Arcangeli
  0 siblings, 2 replies; 243+ messages in thread
From: Steve Bergman @ 1999-01-12  3:21 UTC (permalink / raw
  To: Andrea Arcangeli, Linus Torvalds, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

Here are the results:
 
116 Image test in 128MB:
 
pre6                            2:27
pre5                            1:58
arcavm13                        9:13
arcavm15                        1:59
pre-7                           2:41
arcavm16                        1:54
arcavm18                        1:57
pre-7+zlatko's latest patch	2:14
 
For the kernel compile test in 12MB:
 
                                 Elapsed Maj.    Min.    Swaps
                                -----   ------  ------  -----
pre6                            20:54   352934  191210  48678
pre5                            19:35   334680  183732  93427
arcavm13                        19:45   344452  180243  38977
arcavm15                        20:07   N/A     N/A     N/A
pre-7                           21:14   356386  192835  50912
arcavm16                        20:09   N/A     N/A     N/A
arcavm18                        21:08   363438  190763  48982
pre-7+zlatko's latest patch	21:34	358408	193930	51813

The patch seems to help in the image test and hurt a bit in the 12MB compile
test (vs pre-7).

 
 -Steve
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: Zlatko's new vm patch
  1999-01-12  3:21                                                           ` Results: Zlatko's new vm patch Steve Bergman
@ 1999-01-12  5:33                                                             ` Linus Torvalds
  1999-01-12 14:49                                                               ` Andrea Arcangeli
                                                                                 ` (3 more replies)
  1999-01-13 20:47                                                             ` [patch] arca-vm-19 [Re: Results: Zlatko's new vm patch] Andrea Arcangeli
  1 sibling, 4 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-12  5:33 UTC (permalink / raw
  To: Steve Bergman
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie



Note that there are very few people who are testing interactive feel. I'd
be happier with more people giving more subjective comments on how the
system feels under heavy memory load. 

The only feedback I have so far says that pre-7 is much better than any of
the pre-6 versions, but I'd be happier with more coverage depth and more
comments from people in different circumstances. For example, what does it
feel like when you're paging heavily and doing a "find" at the same time
on a 16M machine?

I know this is harder than just trying to determine the throughput of
something, but the pre-6 thing certainly showed how dangerous it was to
just look at numbers.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-11 22:13                                                                   ` Linus Torvalds
  1999-01-12  0:25                                                                     ` estafford
@ 1999-01-12  7:06                                                                     ` Gregory Maxwell
  1 sibling, 0 replies; 243+ messages in thread
From: Gregory Maxwell @ 1999-01-12  7:06 UTC (permalink / raw
  To: Linus Torvalds; +Cc: Shawn Leas, Chip Salzenberg, linux-kernel, linux-mm

On Mon, 11 Jan 1999, Linus Torvalds wrote:

> Hey, I want to see the memory subsystems for it..
> 
> 		Linus

Actually, I would imagine that such a processor could use a very long
nano-manufactured fiber loop as memory. Sorta like mercury tube storage...

But.. You'd be the one to know about fancy processor dohickies wouldn't
you? :)


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: testing/pre-7 and do_poll()
  1999-01-12  0:25                                                                     ` estafford
@ 1999-01-12  8:25                                                                       ` Shawn Leas
  0 siblings, 0 replies; 243+ messages in thread
From: Shawn Leas @ 1999-01-12  8:25 UTC (permalink / raw
  To: estafford; +Cc: Linus Torvalds, linux-mm, linux-kernel, Chip Salzenberg

On Mon, 11 Jan 1999 estafford@ixl.com wrote:

> 
> On 11-Jan-99 Linus Torvalds wrote:
> > 
> > 
> > On Mon, 11 Jan 1999, Shawn Leas wrote:
> > 
> >> On Sun, 10 Jan 1999, Linus Torvalds wrote:
> >> 
> >> > things. So far, megahertz are still far in the future, but maybe I some
> >> > day will have to remove even that assumption. Unlikely to be a problem in
> >> > my lifetime, but hey, I can hope (whether due to a long life or really
> >> > fast CPU's, I don't care ;) 
> >> 
> >> Well, they've made a photonic chip, so we may be thinking about this
> >> sooner than you think... Think 200GHz processors.
> > 
> > Hey, I want to see the memory subsystems for it..
> > 
> >               Linus
> 
> Got any links to such technology?  I've been working on it for a while, but
> making transistors from glass are not quite as easy as I would think.. Lemme
> know.

Take a lewk at this! And yes, it aint digital logic, it's
quantum logic.  It used to be that photonic computers were not
really processors as we know them.  This is the real deal.

http://www.nbnn.com/pubNews/123239.html

-Shawn
<=========== America Held Hostage ===========>
   Day 2183 for the poor and the middle class. 
   Day 2202 for the rich and the dead.
   739 days remaining in the Raw Deal.
<============================================> 

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: Zlatko's new vm patch
  1999-01-12  5:33                                                             ` Linus Torvalds
@ 1999-01-12 14:49                                                               ` Andrea Arcangeli
  1999-01-12 16:58                                                               ` Joseph Anthony
                                                                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-12 14:49 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Mon, 11 Jan 1999, Linus Torvalds wrote:

> Note that there are very few people who are testing interactive feel. I'd
> be happier with more people giving more subjective comments on how the
> system feels under heavy memory load. 

With my latest free_user_and_cache() (arca-vm >= 16) you can't get bad
interactive performance. Usually bad interactive performance is due to
unbalanced algorithms in the big try_to_free_pages() path.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-11  6:04                                                       ` Eric W. Biederman
@ 1999-01-12 16:06                                                         ` Stephen C. Tweedie
  1999-01-12 17:54                                                           ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-12 16:06 UTC (permalink / raw
  To: Eric W. Biederman
  Cc: Stephen C. Tweedie, Linus Torvalds,
	Savochkin Andrey Vladimirovich, Andrea Arcangeli, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Zlatko Calusic,
	Ben McCann, Alan Cox, bredelin, linux-kernel, Rik van Riel,
	linux-mm

Hi,

On 11 Jan 1999 00:04:11 -0600, ebiederm+eric@ccr.net (Eric W. Biederman)
said:

> Oh, and just as a side note we are currently unfairly penalizing
> threaded programs by doing for_each_task instead of for_each_mm in the
> swapout code...

I know, on my TODO list...

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: Zlatko's new vm patch
  1999-01-12  5:33                                                             ` Linus Torvalds
  1999-01-12 14:49                                                               ` Andrea Arcangeli
@ 1999-01-12 16:58                                                               ` Joseph Anthony
  1999-01-12 18:16                                                                 ` Stephen C. Tweedie
  1999-01-12 18:24                                                               ` Michael K Vance
  1999-01-13  0:01                                                               ` Where to find pre7. Was: " Robert Thorncrantz
  3 siblings, 1 reply; 243+ messages in thread
From: Joseph Anthony @ 1999-01-12 16:58 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

Well, sometimes the system writes to swap before I have used half my
memory ( in X ) I view this with wmmon in windowmaker.. also on shutting
down the system, it fails to unmount partitions saying they are busy and
forcing checks next boot saying they were not cleanly unmounted.
2.2.0-pre4 - pre6 ( I do not have pre7 to test )

On Mon, 11 Jan 1999, Linus Torvalds wrote:

> 
> 
> Note that there are very few people who are testing interactive feel. I'd
> be happier with more people giving more subjective comments on how the
> system feels under heavy memory load. 
> 
> The only feedback I have so far says that pre-7 is much better than any of
> the pre-6 versions, but I'd be happier with more coverage depth and more
> comments from people in different circumstances. For example, what does it
> feel like when you're paging heavily and doing a "find" at the same time
> on a 16M machine?
> 
> I know this is harder than just trying to determine the throughput of
> something, but the pre-6 thing certainly showed how dangerous it was to
> just look at numbers.
> 
> 		Linus
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.rutgers.edu
> Please read the FAQ at http://www.tux.org/lkml/
> 


---
*************************************
*          Joseph Anthony           *
*          jga@cowboy.net           *
*     http://wasteland.cowboy.net   *
*  -------------------------------  *
*  System Administrator Cowboy.net  *
*       http://www.cowboy.net       *
*************************************

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-12 16:06                                                         ` Stephen C. Tweedie
@ 1999-01-12 17:54                                                           ` Linus Torvalds
  1999-01-12 18:44                                                             ` Zlatko Calusic
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-12 17:54 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Eric W. Biederman, Savochkin Andrey Vladimirovich,
	Andrea Arcangeli, steve, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, Alan Cox, bredelin,
	linux-kernel, Rik van Riel, linux-mm



On Tue, 12 Jan 1999, Stephen C. Tweedie wrote:
> 
> On 11 Jan 1999 00:04:11 -0600, ebiederm+eric@ccr.net (Eric W. Biederman)
> said:
> 
> > Oh, and just as a side note we are currently unfairly penalizing
> > threaded programs by doing for_each_task instead of for_each_mm in the
> > swapout code...
> 
> I know, on my TODO list...

Actually, this one is _really_ easy to fix.

The truly trivial fix is to just move "swap_cnt" into the mm structure,
and you're all done. You'd still walk the list with for_each_task(), but
it no longer matters.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: Zlatko's new vm patch
  1999-01-12 16:58                                                               ` Joseph Anthony
@ 1999-01-12 18:16                                                                 ` Stephen C. Tweedie
  1999-01-12 20:15                                                                   ` Michael K Vance
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-12 18:16 UTC (permalink / raw
  To: Joseph Anthony
  Cc: Linus Torvalds, Steve Bergman, Andrea Arcangeli, brent verner,
	Garst R. Reese, Kalle Andersson, Zlatko Calusic, Ben McCann,
	bredelin, linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

Hi,

On Tue, 12 Jan 1999 10:58:06 -0600 (CST), Joseph Anthony
<jga@alien.cowboy.net> said:

> Well, sometimes the system writes to swap before I have used half my
> memory ( in X ) I view this with wmmon in windowmaker.. 

Suspect wmmon in that case.  If you can show this happening in a trace
output from "vmstat 1", then I'll start to worry.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: Zlatko's new vm patch
  1999-01-12  5:33                                                             ` Linus Torvalds
  1999-01-12 14:49                                                               ` Andrea Arcangeli
  1999-01-12 16:58                                                               ` Joseph Anthony
@ 1999-01-12 18:24                                                               ` Michael K Vance
  1999-01-13  0:01                                                               ` Where to find pre7. Was: " Robert Thorncrantz
  3 siblings, 0 replies; 243+ messages in thread
From: Michael K Vance @ 1999-01-12 18:24 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Steve Bergman, Andrea Arcangeli, brent verner, Garst R. Reese,
	Kalle Andersson, Zlatko Calusic, Ben McCann, bredelin,
	linux-kernel, linux-mm, Alan Cox, Stephen C. Tweedie

Linus Torvalds wrote:
 
> Note that there are very few people who are testing interactive feel. I'd
> be happier with more people giving more subjective comments on how the
> system feels under heavy memory load.

I left my machine today (64mb/80mb swap, running pre6 on an MMX/233) running
netscape, xemacs, a few rxvt's, and xscreensaver. Many times when I get home
after classes, xscreensaver's GL apps will have swapped large portions of
netscape and xemacs out. Today when I came home, I tried to check my mail, and
write a bit of code, but everything was swapping left and right. It wasn't
just that netscape and xemacs got swapped back in, and then that was
that--instead it just continually ground my hard drive as it downloaded email
and I switched around to apps, etc, for a good few minutes. Very unpleasant.

FYI,

m.

-- 
"We watched her fall over and lay down,
 shouting the poetic truths of high school journal keepers."
 -- Lee Rinaldo, Sonic Youth
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-12 17:54                                                           ` Linus Torvalds
@ 1999-01-12 18:44                                                             ` Zlatko Calusic
  1999-01-12 19:05                                                               ` Andrea Arcangeli
  1999-01-12 21:46                                                               ` Rik van Riel
  0 siblings, 2 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-12 18:44 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Stephen C. Tweedie, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, Andrea Arcangeli, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, Rik van Riel, linux-mm

Linus Torvalds <torvalds@transmeta.com> writes:

> On Tue, 12 Jan 1999, Stephen C. Tweedie wrote:
> > 
> > On 11 Jan 1999 00:04:11 -0600, ebiederm+eric@ccr.net (Eric W. Biederman)
> > said:
> > 
> > > Oh, and just as a side note we are currently unfairly penalizing
> > > threaded programs by doing for_each_task instead of for_each_mm in the
> > > swapout code...
> > 
> > I know, on my TODO list...
> 
> Actually, this one is _really_ easy to fix.
> 
> The truly trivial fix is to just move "swap_cnt" into the mm structure,
> and you're all done. You'd still walk the list with for_each_task(), but
> it no longer matters.
> 
> 		Linus
> 

Not related to this, but I (hopefully correctly) observed that SHM
swap I/O is done synchronously.

Could somebody spare a minute to explain why is that so, and what
needs to be done to make SHM swapping asynchronous?


Also, while we're at MM fixes, I'm appending below a small patch that
will improve interactive feel.

After number of async pages gets bigger than pager_daemon.swap_cluster
(= SWAP_CLUSTER_MAX), swapin readahead becomes synchronous, and that
hurts performance. It is better to skip readahead in such situations,
and that is also more fair to swapout. Andrea came to exactly the same
conclusion, independent of me (on the same day :)).

diff -urN linux-pre-7/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-pre-7/mm/page_alloc.c	Tue Jan 11 07:28:06 1999
+++ linux/mm/page_alloc.c	Tue Jan 11 07:29:44 1999
@@ -358,6 +358,8 @@
 	for (i = 1 << page_cluster; i > 0; i--) {
 	      if (offset >= swapdev->max)
 		      return;
+	      if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
+		      return;
 	      if (!swapdev->swap_map[offset] ||
 		  swapdev->swap_map[offset] == SWAP_MAP_BAD ||
 		  test_bit(offset, swapdev->swap_lockmap))

Regards,
-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-12 18:44                                                             ` Zlatko Calusic
@ 1999-01-12 19:05                                                               ` Andrea Arcangeli
  1999-01-13 17:48                                                                 ` Stephen C. Tweedie
  1999-01-12 21:46                                                               ` Rik van Riel
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-12 19:05 UTC (permalink / raw
  To: Zlatko Calusic
  Cc: Linus Torvalds, Stephen C. Tweedie, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, Rik van Riel, linux-mm

On 12 Jan 1999, Zlatko Calusic wrote:

> Could somebody spare a minute to explain why is that so, and what
> needs to be done to make SHM swapping asynchronous?

Maybe because nobody cares about shm? I think shm can wait for 2.3 to be
improved.

> Also, while we're at MM fixes, I'm appending below a small patch that
> will improve interactive feel.

This is just in my latest arca patches as you have just noticed. I don't
think that this thing makes much difference though. But sometimes it
could improve performance and it makes tons of sense.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: Zlatko's new vm patch
  1999-01-12 18:16                                                                 ` Stephen C. Tweedie
@ 1999-01-12 20:15                                                                   ` Michael K Vance
  1999-01-13 19:25                                                                     ` Stephen C. Tweedie
  0 siblings, 1 reply; 243+ messages in thread
From: Michael K Vance @ 1999-01-12 20:15 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Joseph Anthony, Linus Torvalds, Steve Bergman, Andrea Arcangeli,
	brent verner, Garst R. Reese, Kalle Andersson, Zlatko Calusic,
	Ben McCann, bredelin, linux-kernel, linux-mm, Alan Cox

"Stephen C. Tweedie" wrote:

> > Well, sometimes the system writes to swap before I have used half my
> > memory ( in X ) I view this with wmmon in windowmaker..
> 
> Suspect wmmon in that case.  If you can show this happening in a trace
> output from "vmstat 1", then I'll start to worry.

wmmon stuffs both swap and physical mem in its "MEM" area, and also has a
listing for "SWP", ie swap. I assume top is still reliable?

m.

-- 
"We watched her fall over and lay down,
 shouting the poetic truths of high school journal keepers."
 -- Lee Rinaldo, Sonic Youth
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-12 18:44                                                             ` Zlatko Calusic
  1999-01-12 19:05                                                               ` Andrea Arcangeli
@ 1999-01-12 21:46                                                               ` Rik van Riel
  1999-01-13  6:52                                                                 ` Zlatko Calusic
  1999-01-13 13:45                                                                 ` Andrea Arcangeli
  1 sibling, 2 replies; 243+ messages in thread
From: Rik van Riel @ 1999-01-12 21:46 UTC (permalink / raw
  To: Zlatko Calusic
  Cc: Linus Torvalds, Stephen C. Tweedie, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, Andrea Arcangeli, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On 12 Jan 1999, Zlatko Calusic wrote:

> After number of async pages gets bigger than
> pager_daemon.swap_cluster (= SWAP_CLUSTER_MAX), swapin readahead
> becomes synchronous, and that hurts performance. It is better to
> skip readahead in such situations, and that is also more fair to
> swapout. Andrea came to exactly the same conclusion, independent
> of me (on the same day :)).

IIRC this facility was in the original swapin readahead
implementation. That only leaves the question who removed
it and why :))

cheers,

Rik -- If a Microsoft product fails, who do you sue?
+-------------------------------------------------------------------+
| Linux memory management tour guide.        riel@humbolt.geo.uu.nl |
| Scouting Vries cubscout leader.    http://humbolt.geo.uu.nl/~riel |
+-------------------------------------------------------------------+

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Where to find pre7. Was: Results: Zlatko's new vm patch
  1999-01-12  5:33                                                             ` Linus Torvalds
                                                                                 ` (2 preceding siblings ...)
  1999-01-12 18:24                                                               ` Michael K Vance
@ 1999-01-13  0:01                                                               ` Robert Thorncrantz
  3 siblings, 0 replies; 243+ messages in thread
From: Robert Thorncrantz @ 1999-01-13  0:01 UTC (permalink / raw
  To: Linus Torvalds, Steve Bergman
  Cc: Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Mon, Jan 11, 1999 at 09:33:08PM -0800, Linus Torvalds wrote:
<snip>
> 
> The only feedback I have so far says that pre-7 is much better than any of
> the pre-6 versions, but I'd be happier with more coverage depth and more

I've not found the pre7 in the usual places on the mirrors. I found
something that looks like it in /pub/linux/kernel/testing/pre-7.gz, is
this the one, and is it a "real" pre-release? I'm a little uncertain
since it's not in the same place as the others.

  /robert

-- 
Robert Thorncrantz                                 rtz@pirx.df.lth.se
Mundus Vult Decipi                              dat95rth@ludat.lth.se
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-12 21:46                                                               ` Rik van Riel
@ 1999-01-13  6:52                                                                 ` Zlatko Calusic
  1999-01-13 13:45                                                                 ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Zlatko Calusic @ 1999-01-13  6:52 UTC (permalink / raw
  To: Rik van Riel
  Cc: Linus Torvalds, Stephen C. Tweedie, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, Andrea Arcangeli, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

Rik van Riel <riel@humbolt.geo.uu.nl> writes:

> On 12 Jan 1999, Zlatko Calusic wrote:
> 
> > After number of async pages gets bigger than
> > pager_daemon.swap_cluster (= SWAP_CLUSTER_MAX), swapin readahead
> > becomes synchronous, and that hurts performance. It is better to
> > skip readahead in such situations, and that is also more fair to
> > swapout. Andrea came to exactly the same conclusion, independent
> > of me (on the same day :)).
> 
> IIRC this facility was in the original swapin readahead
> implementation. That only leaves the question who removed
> it and why :))
> 

*I* did, because the original test was too complicated and nobody
understood what its actual purpose was.

Beside that, when MM code changed recently, nr_free_pages started
hovering at lower values, and that was killing readahead at most cases
(with old test in place), thus producing terrible results, especially
when you had more than one thrashing task.

-- 
Zlatko
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-12 21:46                                                               ` Rik van Riel
  1999-01-13  6:52                                                                 ` Zlatko Calusic
@ 1999-01-13 13:45                                                                 ` Andrea Arcangeli
  1999-01-13 13:58                                                                   ` Chris Evans
                                                                                     ` (2 more replies)
  1 sibling, 3 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-13 13:45 UTC (permalink / raw
  To: Rik van Riel
  Cc: Zlatko Calusic, Linus Torvalds, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Tue, 12 Jan 1999, Rik van Riel wrote:

> IIRC this facility was in the original swapin readahead
> implementation. That only leaves the question who removed
> it and why :))

There's another thing I completely disagree with and that I just removed here. 
It's the alignment of the offset field. I see no point in going back
instead of only doing real read_ahead_. 

Maybe I am missing something?

Index: page_alloc.c
===================================================================
RCS file: /var/cvs/linux/mm/page_alloc.c,v
retrieving revision 1.1.1.8
retrieving revision 1.1.1.1.2.29
diff -u -r1.1.1.8 -r1.1.1.1.2.29
--- page_alloc.c	1999/01/11 21:24:23	1.1.1.8
+++ linux/mm/page_alloc.c	1999/01/12 23:00:04	1.1.1.1.2.29
@@ -353,10 +352,10 @@
 	unsigned long offset = SWP_OFFSET(entry);
 	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
 	
-	offset = (offset >> page_cluster) << page_cluster;
-	
 	for (i = 1 << page_cluster; i > 0; i--) {
-	      if (offset >= swapdev->max)
+	      if (offset >= swapdev->max ||
+		  /* don't block on I/O for doing readahead -arca */
+		  atomic_read(&nr_async_pages) > pager_daemon.max_async_pages)
 		      return;
 	      if (!swapdev->swap_map[offset] ||
 		  swapdev->swap_map[offset] == SWAP_MAP_BAD ||



Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 13:45                                                                 ` Andrea Arcangeli
@ 1999-01-13 13:58                                                                   ` Chris Evans
  1999-01-13 15:07                                                                     ` Andrea Arcangeli
  1999-01-13 14:59                                                                   ` Rik van Riel
  1999-01-13 17:55                                                                   ` [PATCH] " Stephen C. Tweedie
  2 siblings, 1 reply; 243+ messages in thread
From: Chris Evans @ 1999-01-13 13:58 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Rik van Riel, Zlatko Calusic, Linus Torvalds, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Wed, 13 Jan 1999, Andrea Arcangeli wrote:

> On Tue, 12 Jan 1999, Rik van Riel wrote:
> 
> > IIRC this facility was in the original swapin readahead
> > implementation. That only leaves the question who removed
> > it and why :))
> 
> There's another thing I completly disagree and that I just removed here. 
> It's the alignment of the offset field. I see no one point in going back
> instead of only doing real read_ahead_. 
> 
> Maybe I am missing something?

Yes. Imagine the paging in of big binary case. The page faults will occur
all over the place, not in a nice sequential order. The page-in clusters
stuff _doubled_ performance of paging in certain big static binaries.

Chris

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 13:45                                                                 ` Andrea Arcangeli
  1999-01-13 13:58                                                                   ` Chris Evans
@ 1999-01-13 14:59                                                                   ` Rik van Riel
  1999-01-13 18:10                                                                     ` Andrea Arcangeli
  1999-01-13 17:55                                                                   ` [PATCH] " Stephen C. Tweedie
  2 siblings, 1 reply; 243+ messages in thread
From: Rik van Riel @ 1999-01-13 14:59 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Zlatko Calusic, Linus Torvalds, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Wed, 13 Jan 1999, Andrea Arcangeli wrote:
> On Tue, 12 Jan 1999, Rik van Riel wrote:
> 
> > IIRC this facility was in the original swapin readahead
> > implementation. That only leaves the question who removed
> > it and why :))
> 
> There's another thing I completly disagree and that I just removed here. 
> It's the alignment of the offset field. I see no one point in going back
> instead of only doing real read_ahead_. 
> 
> Maybe I am missing something?

Yes, you are:

- aligned reads make sure you don't do smallish readaheads of
  only 1 block (because you've already got the rest)
- there are programs that move through the data backwards or
  tilewise
- in allocating swap space it just doesn't make sense to read
  into the next swap 'region'

Rik -- If a Microsoft product fails, who do you sue?
+-------------------------------------------------------------------+
| Linux memory management tour guide.             riel@nl.linux.org |
| Scouting Vries cubscout leader.     http://www.nl.linux.org/~riel |
+-------------------------------------------------------------------+

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 13:58                                                                   ` Chris Evans
@ 1999-01-13 15:07                                                                     ` Andrea Arcangeli
  1999-01-13 22:11                                                                       ` Stephen C. Tweedie
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-13 15:07 UTC (permalink / raw
  To: Chris Evans
  Cc: Rik van Riel, Zlatko Calusic, Linus Torvalds, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Wed, 13 Jan 1999, Chris Evans wrote:

> Yes. Imagine the paging in of big binary case. The page faults will occur
> all over the place, not in a nice sequential order. The page-in clusters
> stuff _doubled_ performance of paging in certain big static binaries.

I think that if it helped it means that the swap cache got shrunk too much
early due a not good free paging algorithm.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-12 19:05                                                               ` Andrea Arcangeli
@ 1999-01-13 17:48                                                                 ` Stephen C. Tweedie
  1999-01-13 18:07                                                                   ` 2.2.0-pre6 ain't nice =( Kalle Andersson
                                                                                     ` (2 more replies)
  0 siblings, 3 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-13 17:48 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Zlatko Calusic, Linus Torvalds, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, Rik van Riel, linux-mm

Hi,

On Tue, 12 Jan 1999 20:05:21 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> On 12 Jan 1999, Zlatko Calusic wrote:
>> Could somebody spare a minute to explain why is that so, and what
>> needs to be done to make SHM swapping asynchronous?

> Maybe because nobody care about shm? I think shm can wait for 2.3 to be
> improved.

"Nobody"?  Oracle uses large shared memory regions for starters.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* [PATCH] Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 13:45                                                                 ` Andrea Arcangeli
  1999-01-13 13:58                                                                   ` Chris Evans
  1999-01-13 14:59                                                                   ` Rik van Riel
@ 1999-01-13 17:55                                                                   ` Stephen C. Tweedie
  1999-01-13 18:52                                                                     ` Andrea Arcangeli
  2 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-13 17:55 UTC (permalink / raw
  To: Andrea Arcangeli, Linus Torvalds
  Cc: Rik van Riel, Zlatko Calusic, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

Hi,

On Wed, 13 Jan 1999 14:45:09 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> On Tue, 12 Jan 1999, Rik van Riel wrote:
>> IIRC this facility was in the original swapin readahead
>> implementation. That only leaves the question who removed
>> it and why :))

> There's another thing I completly disagree and that I just removed here. 
> It's the alignment of the offset field. I see no one point in going back
> instead of only doing real read_ahead_. 

> Maybe I am missing something?

Yes, very much so.

When paging in binaries, you often have locality of reference in both
directions --- a set of functions compiled from a single source file
will occupy adjacent pages in VM, but you are as likely to call a
function at the end of the region first as one at the beginning.  It
is very common to get backwards locality as a result.

The big advantage of doing aligned clusters for readin is twofold:
first, it means that you get as much of a readahead advantage for
these backwards access patterns as for forward accesses.  Secondly, it
means that you are reading in complete tiles which are guaranteed to
have no gaps between them, so any two accesses in adjacent tiles are
sufficient to read in the complete set of nearby pages without missing
any gaps between them: it avoids having to do yet another IO to fill
in the few pages missed by a strictly forward-looking readahead
function.

> +		  /* don't block on I/O for doing readahead -arca */
> +		  atomic_read(&nr_async_pages) > pager_daemon.max_async_pages)
>  		      return;

I think this is the wrong solution: far better to do the patch below,
which simply exempts reads from nr_async_pages altogether.  I
originally added nr_async_pages to serve two functions: to allow
kswapd to determine how much memory it was already in the process of
freeing, and to act as a throttle on the number of write IOs submitted
when swapping.

We don't need a similar throttling action for reads, because every
place where we do VM readahead, each readahead IO cluster is followed
by a synchronous read on one page.  We don't throttle the async
readaheads on normal file IO, for example.

--Stephen

----------------------------------------------------------------
--- mm/page_io.c~	Mon Dec 28 21:56:29 1998
+++ mm/page_io.c	Tue Jan 12 16:45:55 1999
@@ -58,7 +58,8 @@
 	}
 
 	/* Don't allow too many pending pages in flight.. */
-	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
+	if (rw == WRITE &&
+	    atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
 		wait = 1;
 
 	p = &swap_info[type];
@@ -170,7 +171,7 @@
 		atomic_dec(&page->count);
 		return;
 	}
- 	if (!wait) {
+ 	if (rw == WRITE && !wait) {
  		set_bit(PG_decr_after, &page->flags);
  		atomic_inc(&nr_async_pages);
  	}
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* 2.2.0-pre6 ain't nice =(
  1999-01-13 17:48                                                                 ` Stephen C. Tweedie
@ 1999-01-13 18:07                                                                   ` Kalle Andersson
  1999-01-13 19:05                                                                   ` MM deadlock [was: Re: arca-vm-8...] Alan Cox
  1999-01-14 10:48                                                                   ` Mike Jagdis
  2 siblings, 0 replies; 243+ messages in thread
From: Kalle Andersson @ 1999-01-13 18:07 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Andrea Arcangeli, Zlatko Calusic, Linus Torvalds,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Ben McCann, Alan Cox, bredelin,
	linux-kernel, Rik van Riel, linux-mm

Hello

I've been running 2.2.0-pre6 for about 5 days now (w/o reboot) and I'm sad
to say that it seems to swap more and more each day. Especially when
some heavy I/O is running, but even when the system is not used at
all it is much more sluggish than when freshly rebooted... 

I hope this problem will be fixed, we certainly don't want NT emulation =)

--
Med vanlig halsning
Kalle Andersson
kalle@sslug.dk

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 14:59                                                                   ` Rik van Riel
@ 1999-01-13 18:10                                                                     ` Andrea Arcangeli
  1999-01-13 22:14                                                                       ` Stephen C. Tweedie
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-13 18:10 UTC (permalink / raw
  To: Rik van Riel
  Cc: Zlatko Calusic, Linus Torvalds, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Wed, 13 Jan 1999, Rik van Riel wrote:

> - in allocating swap space it just doesn't make sense to read
>   into the next swap 'region'

The point is that I can't see a swap `region' looking at how
scan_swap_map() works. The more atomic region I can see in the swap space
is a block of bytes large PAGE_SIZE bytes (e.g. offset ;).

For the case of binaries the aging on the page cache should take care of
it (even if there's no aging on the swap cache as in pre[567] if I remember
well). 

Andrea Arcangeli


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [PATCH] Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 17:55                                                                   ` [PATCH] " Stephen C. Tweedie
@ 1999-01-13 18:52                                                                     ` Andrea Arcangeli
  1999-01-13 22:10                                                                       ` Stephen C. Tweedie
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-13 18:52 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Rik van Riel, Zlatko Calusic, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

On Wed, 13 Jan 1999, Stephen C. Tweedie wrote:

> I think this is the wrong solution: far better to do the patch below,
> which simply exempts reads from nr_async_pages altogether.  I
> originally added nr_async_pages to serve two functions: to allow
> kswapd to determine how much memory it was already in the process of
> freeing, and to act as a throttle on the number of write IOs submitted
> when swapping.
> 
> We don't need a similar throttling action for reads, because every
> place where we do VM readahead, each readahead IO cluster is followed
> by a synchronous read on one page.  We don't throttle the async
> readaheads on normal file IO, for example.

Note that we don't need nr_async_pages at all. Here when the limit of
nr_async_pages is low it's only a bottleneck for swapout performance. I
have not removed it (because it could be useful to decrease swapout I/O if
somebody needs this strange feature), but I have added a
page_daemon.max_async_pages and set it to something like 256. Now I check
nr_async_pages against the new max_async_pages. 

I _guess_ (not checked) that the _only_ reason Steve saw arca-vm-16 so
highly improved when changing SWAP_CLUSTER_MAX to 512 instead of 32 is the
removal of the nr_async_pages bottleneck. 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 17:48                                                                 ` Stephen C. Tweedie
  1999-01-13 18:07                                                                   ` 2.2.0-pre6 ain't nice =( Kalle Andersson
@ 1999-01-13 19:05                                                                   ` Alan Cox
  1999-01-13 19:23                                                                     ` MOLNAR Ingo
  1999-01-13 19:26                                                                     ` Andrea Arcangeli
  1999-01-14 10:48                                                                   ` Mike Jagdis
  2 siblings, 2 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-13 19:05 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: andrea, Zlatko.Calusic, torvalds, ebiederm+eric, saw, steve,
	damonbrent, reese, kalle.andersson, bmccann, alan, bredelin,
	linux-kernel, H.H.vanRiel, linux-mm

> >> Could somebody spare a minute to explain why is that so, and what
> >> needs to be done to make SHM swapping asynchronous?
> 
> > Maybe because nobody care about shm? I think shm can wait for 2.3 to be
> > improved.
> 
> "Nobody"?  Oracle uses large shared memory regions for starters.

All the big databases use large shared memory objects. 

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 19:05                                                                   ` MM deadlock [was: Re: arca-vm-8...] Alan Cox
@ 1999-01-13 19:23                                                                     ` MOLNAR Ingo
  1999-01-13 19:26                                                                     ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: MOLNAR Ingo @ 1999-01-13 19:23 UTC (permalink / raw
  To: Alan Cox
  Cc: Stephen C. Tweedie, andrea, Zlatko.Calusic, torvalds,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, H.H.vanRiel, linux-mm

On Wed, 13 Jan 1999, Alan Cox wrote:

> > "Nobody"?  Oracle uses large shared memory regions for starters.
> 
> All the big databases use large shared memory objects. 

which is _not_ expected to be swapped at all for a correctly set-up Oracle
database installation.

-- mingo

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: Results: Zlatko's new vm patch
  1999-01-12 20:15                                                                   ` Michael K Vance
@ 1999-01-13 19:25                                                                     ` Stephen C. Tweedie
  0 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-13 19:25 UTC (permalink / raw
  To: Michael K Vance
  Cc: Stephen C. Tweedie, Joseph Anthony, Linus Torvalds, Steve Bergman,
	Andrea Arcangeli, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox

Hi,

On Tue, 12 Jan 1999 15:15:22 -0500, Michael K Vance <mkv102@psu.edu>
said:

> wmmon stuffs both swap and physical mem in its "MEM" area, and also has a
> listing for "SWP", ie swap. I assume top is still reliable?

It should be, yes.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 19:05                                                                   ` MM deadlock [was: Re: arca-vm-8...] Alan Cox
  1999-01-13 19:23                                                                     ` MOLNAR Ingo
@ 1999-01-13 19:26                                                                     ` Andrea Arcangeli
  1999-01-14 11:02                                                                       ` Mike Jagdis
  1999-01-15  7:40                                                                       ` Agus Budy Wuysang
  1 sibling, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-13 19:26 UTC (permalink / raw
  To: Alan Cox; +Cc: Stephen C. Tweedie, linux-kernel, linux-mm

On Wed, 13 Jan 1999, Alan Cox wrote:

> > >> Could somebody spare a minute to explain why is that so, and what
> > >> needs to be done to make SHM swapping asynchronous?
> > 
> > > Maybe because nobody care about shm? I think shm can wait for 2.3 to be
> > > improved.
> > 
> > "Nobody"?  Oracle uses large shared memory regions for starters.
> 
> All the big databases use large shared memory objects. 

I wasn't aware of that. I noticed that also postgres (a big database) uses
shm but it's _only_ something like 1 Mbyte (at least during trivial
usage). With my current code such 1 Mbyte would not be touched unless
there would be some really big memory squeeze (like something that causes
a swapout of tons of memory and that would slow down the system a bit
anyway). This is obviously not true in clean pre7 (see
try_to_free_pages()). 

With my latest code, optimizing the shm swapout (as we do with normal
userspace memory) would help only if the shm memory is going to be in size
something like the total VM allocated in all processes' mm.  Since I
supposed that "normal" apps don't use huge amount of shm memory I told
that we could not care until 2.3. I can't know how much shm memory the
Oracle SQL server uses because I don't have it. I am pretty sure instead
that postgresql will not be stuck in shm swapout here even if the shm
swapout code is gross.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* [patch] arca-vm-19 [Re: Results: Zlatko's new vm patch]
  1999-01-12  3:21                                                           ` Results: Zlatko's new vm patch Steve Bergman
  1999-01-12  5:33                                                             ` Linus Torvalds
@ 1999-01-13 20:47                                                             ` Andrea Arcangeli
  1999-01-14 12:30                                                               ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-13 20:47 UTC (permalink / raw
  To: Steve Bergman, dlux
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

I produced a new arca-vm-19. I would like it if you could try it. I don't
know if it will work as well as the previous one...

You could try it on 128Mbyte:

1. with the bootup pager settings (8 2 2 5 32 128 256).
2. after `echo 10 2 2 5 32 128 256 >/proc/sys/vm/pager`
3. after `echo 8 2 2 10 32 128 256 >/proc/sys/vm/pager`
4. after `echo 8 2 2 5 32 32 256 >/proc/sys/vm/pager`
5. after `echo 8 2 2 5 32 128 512 >/proc/sys/vm/pager`
6. after `echo 8 2 2 5 64 128 512 >/proc/sys/vm/pager`

NOTENOTE: if the performance of `1.' is worse than arca-vm-18, don't
_waste_ time trying other pager settings of course ;). 

Again the main difference is the free_user_and_cache() implementation. Now I
try to balance the swap cache to 5% of total memory during heavy
swapping activities. I do that by growing the cache slowly from the point
shrink_mmap starts failing. This _seems_ to work pretty well (the cache
levels seem more balanced than in arca-vm-18). But again I based the
behavior on a fixed number (this time tunable via sysctl). I tried
inventing some new autotuning algorithm but it seems that everything I
did was performing worse than both arca-vm-18 and arca-vm-19 (the new
below).

Andrea Arcangeli

arca-vm-19 against pre7.gz (in the testing directory), I hope to have
included everything in this diff...

Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.9 linux/mm/filemap.c:1.1.1.1.2.46
--- linux/mm/filemap.c:1.1.1.9	Thu Jan  7 12:21:35 1999
+++ linux/mm/filemap.c	Wed Jan 13 21:23:38 1999
@@ -121,14 +125,11 @@
 int shrink_mmap(int priority, int gfp_mask)
 {
 	static unsigned long clock = 0;
-	unsigned long limit = num_physpages;
 	struct page * page;
-	int count;
-
-	count = (limit << 1) >> priority;
+	unsigned long count = num_physpages / (priority+1);
 
 	page = mem_map + clock;
-	do {
+	while (count-- != 0) {
 		int referenced;
 
 		/* This works even in the presence of PageSkip because
@@ -147,7 +148,6 @@
 			clock = page->map_nr;
 		}
 		
-		count--;
 		referenced = test_and_clear_bit(PG_referenced, &page->flags);
 
 		if (PageLocked(page))
@@ -160,21 +160,6 @@
 		if (atomic_read(&page->count) != 1)
 			continue;
 
-		/*
-		 * Is it a page swap page? If so, we want to
-		 * drop it if it is no longer used, even if it
-		 * were to be marked referenced..
-		 */
-		if (PageSwapCache(page)) {
-			if (referenced && swap_count(page->offset) != 1)
-				continue;
-			delete_from_swap_cache(page);
-			return 1;
-		}	
-
-		if (referenced)
-			continue;
-
 		/* Is it a buffer page? */
 		if (page->buffers) {
 			if (buffer_under_min())
@@ -184,6 +169,14 @@
 			return 1;
 		}
 
+		if (referenced)
+			continue;
+
+		if (PageSwapCache(page)) {
+			delete_from_swap_cache(page);
+			return 1;
+		}	
+
 		/* is it a page-cache page? */
 		if (page->inode) {
 			if (pgcache_under_min())
@@ -191,8 +184,7 @@
 			remove_inode_page(page);
 			return 1;
 		}
-
-	} while (count > 0);
+	}
 	return 0;
 }
 
Index: linux/mm/mmap.c
diff -u linux/mm/mmap.c:1.1.1.2 linux/mm/mmap.c:1.1.1.1.2.12
--- linux/mm/mmap.c:1.1.1.2	Fri Nov 27 11:19:10 1998
+++ linux/mm/mmap.c	Wed Jan 13 21:23:38 1999
@@ -66,7 +66,7 @@
 	free += page_cache_size;
 	free += nr_free_pages;
 	free += nr_swap_pages;
-	free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100; 
+	free -= (pager_daemon.cache_min_percent + pager_daemon.buffer_min_percent + 2)*num_physpages/100; 
 	return free > pages;
 }
 
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.8 linux/mm/page_alloc.c:1.1.1.1.2.30
--- linux/mm/page_alloc.c:1.1.1.8	Mon Jan 11 22:24:23 1999
+++ linux/mm/page_alloc.c	Wed Jan 13 21:23:38 1999
@@ -124,7 +124,6 @@
 	if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
 		if (PageSwapCache(page))
 			panic ("Freeing swap cache page");
-		page->flags &= ~(1 << PG_referenced);
 		free_pages_ok(page->map_nr, 0);
 		return;
 	}
@@ -141,7 +140,6 @@
 		if (atomic_dec_and_test(&map->count)) {
 			if (PageSwapCache(map))
 				panic ("Freeing swap cache pages");
-			map->flags &= ~(1 << PG_referenced);
 			free_pages_ok(map_nr, order);
 			return;
 		}
@@ -212,19 +210,18 @@
 		 * further thought.
 		 */
 		if (!(current->flags & PF_MEMALLOC)) {
-			static int trashing = 0;
 			int freed;
 
 			if (nr_free_pages > freepages.min) {
-				if (!trashing)
+				if (!current->trashing)
 					goto ok_to_allocate;
 				if (nr_free_pages > freepages.low) {
-					trashing = 0;
+					current->trashing = 0;
 					goto ok_to_allocate;
 				}
 			}
 
-			trashing = 1;
+			current->trashing = 1;
 			current->flags |= PF_MEMALLOC;
 			freed = try_to_free_pages(gfp_mask);
 			current->flags &= ~PF_MEMALLOC;
@@ -353,10 +350,10 @@
 	unsigned long offset = SWP_OFFSET(entry);
 	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
 	
-	offset = (offset >> page_cluster) << page_cluster;
-	
 	for (i = 1 << page_cluster; i > 0; i--) {
-	      if (offset >= swapdev->max)
+	      if (offset >= swapdev->max ||
+		  /* don't block on I/O for doing readahead -arca */
+		  atomic_read(&nr_async_pages) > pager_daemon.max_async_pages)
 		      return;
 	      if (!swapdev->swap_map[offset] ||
 		  swapdev->swap_map[offset] == SWAP_MAP_BAD ||
Index: linux/mm/page_io.c
diff -u linux/mm/page_io.c:1.1.1.4 linux/mm/page_io.c:1.1.1.1.2.6
--- linux/mm/page_io.c:1.1.1.4	Tue Dec 29 01:39:20 1998
+++ linux/mm/page_io.c	Wed Jan 13 00:00:04 1999
@@ -58,7 +58,7 @@
 	}
 
 	/* Don't allow too many pending pages in flight.. */
-	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
+	if (atomic_read(&nr_async_pages) > pager_daemon.max_async_pages)
 		wait = 1;
 
 	p = &swap_info[type];
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.6 linux/mm/swap.c:1.1.1.1.2.13
--- linux/mm/swap.c:1.1.1.6	Mon Jan 11 22:24:24 1999
+++ linux/mm/swap.c	Wed Jan 13 21:23:38 1999
@@ -40,41 +40,18 @@
 };
 
 /* How many pages do we try to swap or page in/out together? */
-int page_cluster = 4; /* Default value modified in swap_setup() */
+int page_cluster = 5; /* Default readahead 32 pages every time */
 
 /* We track the number of pages currently being asynchronously swapped
    out, so that we don't try to swap TOO many pages out at once */
 atomic_t nr_async_pages = ATOMIC_INIT(0);
 
-buffer_mem_t buffer_mem = {
+pager_daemon_t pager_daemon = {
+	8,	/* starting priority of try_to_free_pages() */
 	2,	/* minimum percent buffer */
-	10,	/* borrow percent buffer */
-	60	/* maximum percent buffer */
-};
-
-buffer_mem_t page_cache = {
 	2,	/* minimum percent page cache */
-	15,	/* borrow percent page cache */
-	75	/* maximum */
-};
-
-pager_daemon_t pager_daemon = {
-	512,	/* base number for calculating the number of tries */
-	SWAP_CLUSTER_MAX,	/* minimum number of tries */
-	SWAP_CLUSTER_MAX,	/* do swap I/O in clusters of this size */
+	5,	/* minimum percent swap page cache */
+	32,	/* number of tries we do on every try_to_free_pages() */
+	128,	/* do swap I/O in clusters of this size */
+	256	/* max number of async swapped-out pages on the fly */
 };
-
-/*
- * Perform any setup for the swap system
- */
-
-void __init swap_setup(void)
-{
-	/* Use a smaller cluster for memory <16MB or <32MB */
-	if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT))
-		page_cluster = 2;
-	else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))
-		page_cluster = 3;
-	else
-		page_cluster = 4;
-}
Index: linux/mm/swapfile.c
diff -u linux/mm/swapfile.c:1.1.1.3 linux/mm/swapfile.c:1.1.1.1.2.6
--- linux/mm/swapfile.c:1.1.1.3	Mon Jan 11 22:24:24 1999
+++ linux/mm/swapfile.c	Wed Jan 13 00:00:04 1999
@@ -23,7 +23,6 @@
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-#define SWAPFILE_CLUSTER 256
 
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
@@ -31,7 +30,7 @@
 	/* 
 	 * We try to cluster swap pages by allocating them
 	 * sequentially in swap.  Once we've allocated
-	 * SWAPFILE_CLUSTER pages this way, however, we resort to
+	 * SWAP_CLUSTER pages this way, however, we resort to
 	 * first-free allocation, starting a new cluster.  This
 	 * prevents us from scattering swap pages all over the entire
 	 * swap partition, so that we reduce overall disk seek times
@@ -47,7 +46,7 @@
 			goto got_page;
 		}
 	}
-	si->cluster_nr = SWAPFILE_CLUSTER;
+	si->cluster_nr = SWAP_CLUSTER;
 	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
 		if (si->swap_map[offset])
 			continue;
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.12 linux/mm/vmscan.c:1.1.1.1.2.91
--- linux/mm/vmscan.c:1.1.1.12	Mon Jan 11 22:24:24 1999
+++ linux/mm/vmscan.c	Wed Jan 13 21:23:38 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache() and always async swapout original idea.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,6 +25,8 @@
 
 #include <asm/pgtable.h>
 
+int swapout_interval = HZ;
+
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -71,6 +78,21 @@
 	 * memory, and we should just continue our scan.
 	 */
 	if (PageSwapCache(page_map)) {
+		if (pte_write(pte))
+		{
+			struct page *found;
+			printk ("VM: Found a writable swap-cached page!\n");
+			/* Try to diagnose the problem ... */
+			found = find_page(&swapper_inode, page_map->offset);
+			if (found) {
+				printk("page=%p@%08lx, found=%p, count=%d\n",
+				       page_map, page_map->offset,
+				       found, atomic_read(&found->count));
+				__free_page(found);
+			} else 
+				printk ("Spurious, page not in cache\n");
+			return 0;
+		}
 		entry = page_map->offset;
 		swap_duplicate(entry);
 		set_pte(page_table, __pte(entry));
@@ -199,7 +221,7 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
+		tsk->mm->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
@@ -271,7 +293,7 @@
 	/*
 	 * Go through process' page directory.
 	 */
-	address = p->swap_address;
+	address = p->mm->swap_address;
 
 	/*
 	 * Find the proper vm-area
@@ -293,8 +315,8 @@
 	}
 
 	/* We didn't find anything for the process */
-	p->swap_cnt = 0;
-	p->swap_address = 0;
+	p->mm->swap_cnt = 0;
+	p->mm->swap_address = 0;
 	return 0;
 }
 
@@ -306,7 +328,8 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int counter, assign;
+	unsigned long max_cnt;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -325,7 +348,7 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
+	else if (counter > nr_tasks)
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
@@ -338,13 +361,13 @@
 		for (; p != &init_task; p = p->next_task) {
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
-				p->swap_cnt = p->mm->rss;
-			if (p->swap_cnt > max_cnt) {
-				max_cnt = p->swap_cnt;
+				p->mm->swap_cnt = p->mm->rss;
+			if (p->mm->swap_cnt > max_cnt) {
+				max_cnt = p->mm->swap_cnt;
 				pbest = p;
 			}
 		}
@@ -375,8 +398,6 @@
        int i;
        char *revision="$Revision: 1.5 $", *s, *e;
 
-       swap_setup();
-       
        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
                s++, i = e - s;
@@ -430,7 +451,7 @@
 			break;
 		current->state = TASK_INTERRUPTIBLE;
 		run_task_queue(&tq_disk);
-		schedule_timeout(HZ);
+		schedule_timeout(swapout_interval);
 
 		/*
 		 * kswapd isn't even meant to keep up with anything,
@@ -438,13 +459,36 @@
 		 * point is to make sure that the system doesn't stay
 		 * forever in a really bad memory squeeze.
 		 */
-		if (nr_free_pages < freepages.high)
+		if (nr_free_pages < freepages.min)
 			try_to_free_pages(GFP_KSWAPD);
 	}
 
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	int freed, swapped = 0;
+	static int grow_swap_cache_mode = 0;
+
+	if (!grow_swap_cache_mode)
+	{
+		freed = shrink_mmap(priority, gfp_mask);
+		if (!freed)
+		{
+			grow_swap_cache_mode = 1;
+			swapped = swap_out(priority, gfp_mask);
+		}
+	} else {
+		if (!swpcache_under_min())
+			grow_swap_cache_mode = 0;
+		swapped = swap_out(priority, gfp_mask);
+		freed = shrink_mmap(priority, gfp_mask);
+	}
+
+	return freed || swapped;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,34 +501,33 @@
 int try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
-	int count = SWAP_CLUSTER_MAX;
+	static int state = 0;
+	int count = pager_daemon.tries;
 
 	lock_kernel();
-
-	/* Always trim SLAB caches when memory gets low. */
-	kmem_cache_reap(gfp_mask);
-
-	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
 
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	priority = pager_daemon.priority;
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			kmem_cache_reap(gfp_mask);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.6 linux/kernel/fork.c:1.1.1.1.2.10
--- linux/kernel/fork.c:1.1.1.6	Mon Jan 11 22:24:21 1999
+++ linux/kernel/fork.c	Mon Jan 11 22:56:09 1999
@@ -511,6 +514,7 @@
 
 	p->did_exec = 0;
 	p->swappable = 0;
+	p->trashing = 0;
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
Index: linux/kernel/sysctl.c
diff -u linux/kernel/sysctl.c:1.1.1.6 linux/kernel/sysctl.c:1.1.1.1.2.12
--- linux/kernel/sysctl.c:1.1.1.6	Mon Jan 11 22:24:22 1999
+++ linux/kernel/sysctl.c	Wed Jan 13 21:23:38 1999
@@ -32,7 +32,7 @@
 
 /* External variables not in a header file. */
 extern int panic_timeout;
-extern int console_loglevel, C_A_D;
+extern int console_loglevel, C_A_D, swapout_interval;
 extern int bdf_prm[], bdflush_min[], bdflush_max[];
 extern char binfmt_java_interpreter[], binfmt_java_appletviewer[];
 extern int sysctl_overcommit_memory;
@@ -216,6 +216,8 @@
 };
 
 static ctl_table vm_table[] = {
+	{VM_SWAPOUT, "swapout_interval",
+	 &swapout_interval, sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_FREEPG, "freepages", 
 	 &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec},
 	{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0600, NULL,
@@ -223,11 +225,7 @@
 	 &bdflush_min, &bdflush_max},
 	{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
 	 sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
-	{VM_BUFFERMEM, "buffermem",
-	 &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGECACHE, "pagecache",
-	 &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGERDAEMON, "kswapd",
+	{VM_PAGERDAEMON, "pager",
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
 	{VM_PGT_CACHE, "pagetable_cache", 
 	 &pgt_cache_water, 2*sizeof(int), 0600, NULL, &proc_dointvec},
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.6 linux/include/linux/mm.h:1.1.1.1.2.20
--- linux/include/linux/mm.h:1.1.1.6	Mon Jan 11 22:23:57 1999
+++ linux/include/linux/mm.h	Wed Jan 13 21:23:36 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -302,8 +301,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
@@ -387,9 +385,11 @@
 }
 
 #define buffer_under_min()	((buffermem >> PAGE_SHIFT) * 100 < \
-				buffer_mem.min_percent * num_physpages)
-#define pgcache_under_min()	(page_cache_size * 100 < \
-				page_cache.min_percent * num_physpages)
+				pager_daemon.buffer_min_percent * num_physpages)
+#define pgcache_under_min()	((page_cache_size-swapper_inode.i_nrpages) * 100 < \
+				pager_daemon.cache_min_percent * num_physpages)
+#define swpcache_under_min()	(swapper_inode.i_nrpages * 100 < \
+				pager_daemon.swap_min_percent * num_physpages)
 
 #endif /* __KERNEL__ */
 
Index: linux/include/linux/sched.h
diff -u linux/include/linux/sched.h:1.1.1.6 linux/include/linux/sched.h:1.1.1.1.2.12
--- linux/include/linux/sched.h:1.1.1.6	Mon Jan 11 22:24:03 1999
+++ linux/include/linux/sched.h	Wed Jan 13 00:00:03 1999
@@ -169,6 +174,7 @@
 	unsigned long rss, total_vm, locked_vm;
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
+	unsigned long swap_cnt, swap_address;
 	/*
 	 * This is an architecture-specific pointer: the portable
 	 * part of Linux does not know about any segments.
@@ -177,15 +183,17 @@
 };
 
 #define INIT_MM {					\
-		&init_mmap, NULL, swapper_pg_dir, 	\
+		&init_mmap, NULL, swapper_pg_dir,	\
 		ATOMIC_INIT(1), 1,			\
 		MUTEX,					\
 		0,					\
 		0, 0, 0, 0,				\
-		0, 0, 0, 				\
+		0, 0, 0,				\
 		0, 0, 0, 0,				\
 		0, 0, 0,				\
-		0, 0, NULL }
+		0, 0,					\
+		0, 0,					\
+		NULL }
 
 struct signal_struct {
 	atomic_t		count;
@@ -270,8 +278,7 @@
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
-	unsigned long swap_address;
-	unsigned long swap_cnt;		/* number of pages to swap on next pass */
+	int trashing:1;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
@@ -355,7 +362,7 @@
 /* utime */	{0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */	0,0,0,0,0,0, \
-/* swp */	0,0,0, \
+/* swp */	0,0, \
 /* process credentials */					\
 /* uid etc */	0,0,0,0,0,0,0,0,				\
 /* suppl grps*/ 0, {0,},					\
Index: linux/include/linux/swapctl.h
diff -u linux/include/linux/swapctl.h:1.1.1.4 linux/include/linux/swapctl.h:1.1.1.1.2.5
--- linux/include/linux/swapctl.h:1.1.1.4	Mon Jan 11 22:24:05 1999
+++ linux/include/linux/swapctl.h	Wed Jan 13 21:23:36 1999
@@ -4,32 +4,24 @@
 #include <asm/page.h>
 #include <linux/fs.h>
 
-typedef struct buffer_mem_v1
+typedef struct freepages_s
 {
-	unsigned int	min_percent;
-	unsigned int	borrow_percent;
-	unsigned int	max_percent;
-} buffer_mem_v1;
-typedef buffer_mem_v1 buffer_mem_t;
-extern buffer_mem_t buffer_mem;
-extern buffer_mem_t page_cache;
-
-typedef struct freepages_v1
-{
 	unsigned int	min;
 	unsigned int	low;
 	unsigned int	high;
-} freepages_v1;
-typedef freepages_v1 freepages_t;
+} freepages_t;
 extern freepages_t freepages;
 
-typedef struct pager_daemon_v1
+typedef struct pager_daemon_s
 {
-	unsigned int	tries_base;
-	unsigned int	tries_min;
+	unsigned int	priority;
+	unsigned int	buffer_min_percent;
+	unsigned int	cache_min_percent;
+	unsigned int	swap_min_percent;
+	unsigned int	tries;
 	unsigned int	swap_cluster;
-} pager_daemon_v1;
-typedef pager_daemon_v1 pager_daemon_t;
+	unsigned int	max_async_pages;
+} pager_daemon_t;
 extern pager_daemon_t pager_daemon;
 
 #endif /* _LINUX_SWAPCTL_H */
Index: linux/include/linux/swap.h
diff -u linux/include/linux/swap.h:1.1.1.6 linux/include/linux/swap.h:1.1.1.1.2.17
--- linux/include/linux/swap.h:1.1.1.6	Mon Jan 11 22:24:05 1999
+++ linux/include/linux/swap.h	Wed Jan 13 21:28:52 1999
@@ -33,7 +33,7 @@
 #define SWP_USED	1
 #define SWP_WRITEOK	3
 
-#define SWAP_CLUSTER_MAX 32
+#define SWAP_CLUSTER	(pager_daemon.swap_cluster)
 
 #define SWAP_MAP_MAX	0x7fff
 #define SWAP_MAP_BAD	0x8000
@@ -68,9 +68,6 @@
 
 /* linux/ipc/shm.c */
 extern int shm_swap (int, int);
-
-/* linux/mm/swap.c */
-extern void swap_setup (void);
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(unsigned int gfp_mask);

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [PATCH] Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 18:52                                                                     ` Andrea Arcangeli
@ 1999-01-13 22:10                                                                       ` Stephen C. Tweedie
  1999-01-13 22:30                                                                         ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-13 22:10 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Stephen C. Tweedie, Linus Torvalds, Rik van Riel, Zlatko Calusic,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

Hi,

On Wed, 13 Jan 1999 19:52:03 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> Note that we don't need nr_async_pages at all. Here when the limit of
> nr_async_pages is low it's only a bottleneck for swapout performances. I
> have not removed it (because it could be useful to decrease swapout I/O if
> somebody needs this strange feature), but I have added a
> page_daemon.max_async_pages and set it to something like 256. Now I check
> nr_async_pages against the new max_async_pages. 

The problem is that if you do this, it is easy for the swapper to
generate huge amounts of async IO without actually freeing any real
memory: there's a question of balancing the amount of free memory we
have available right now with the amount which we are in the process of
freeing.  Setting the nr_async_pages bound to 256 just makes the swapper
keen to send a whole 1MB of memory out to disk at a time, which is a bit
steep on an 8MB box.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 15:07                                                                     ` Andrea Arcangeli
@ 1999-01-13 22:11                                                                       ` Stephen C. Tweedie
  0 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-13 22:11 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Chris Evans, Rik van Riel, Zlatko Calusic, Linus Torvalds,
	Stephen C. Tweedie, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

Hi,

On Wed, 13 Jan 1999 16:07:13 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> On Wed, 13 Jan 1999, Chris Evans wrote:
>> Yes. Imagine the paging in of big binary case. The page faults will occur
>> all over the place, not in a nice sequential order. The page-in clusters
>> stuff _doubled_ performance of paging in certain big static binaries.

> I think that if it helped it means that the swap cache got shrunk too much
> early due a not good free paging algorithm.

Not in the slightest.  We're talking about the things like the
performance of starting up a fresh new copy of netscape.  Swapout has
nothing to do with it in that case: we are starting from a ground state
where the binary is completely uncached.  The clustered pagein has a
huge impact in that case.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 18:10                                                                     ` Andrea Arcangeli
@ 1999-01-13 22:14                                                                       ` Stephen C. Tweedie
  1999-01-14 14:53                                                                         ` Dr. Werner Fink
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-13 22:14 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Rik van Riel, Zlatko Calusic, Linus Torvalds, Stephen C. Tweedie,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

Hi,

On Wed, 13 Jan 1999 19:10:28 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> On Wed, 13 Jan 1999, Rik van Riel wrote:
>> - in allocating swap space it just doesn't make sense to read
>> into the next swap 'region'

> The point is that I can't see a swap `region' looking at how
> scan_swap_map() works. The more atomic region I can see in the swap space
> is a block of bytes large PAGE_SIZE bytes (e.g. offset ;).

The whole point is that we try to swap adjacent virtual pages to
adjacent swap entries, so there is a good chance that nearby swap
entries are going to be useful when we page them back in again.  Given
that adjacent swap entries on a swap partition are guaranteed to be
physically contiguous, it costs very little to swap in several nearby
elements at the same time, and we get a good chance of reading in useful
pages.

> For the case of binaries the aging on the page cache should take care of
> it (even if there's no aging on the swap cache as pre[567] if I remember
> well). 

There is no aging on the page cache at all other than the PG_referenced
bit.

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [PATCH] Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 22:10                                                                       ` Stephen C. Tweedie
@ 1999-01-13 22:30                                                                         ` Linus Torvalds
  0 siblings, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-13 22:30 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Andrea Arcangeli, Rik van Riel, Zlatko Calusic, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm


On Wed, 13 Jan 1999, Stephen C. Tweedie wrote:
> 
> The problem is that if you do this, it is easy for the swapper to
> generate huge amounts of async IO without actually freeing any real
> memory: there's a question of balancing the amount of free memory we
> have available right now with the amount which we are in the process of
> freeing.  Setting the nr_async_pages bound to 256 just makes the swapper
> keen to send a whole 1MB of memory out to disk at a time, which is a bit
> steep on an 8MB box.

Note that this should be much less of a problem with the current swapout
strategies, but yes, basically we definitely do want to have _some_ way of
maintaining a sane "maximum number of pages in flight" thing. 

The right solution may be to do the check in some other place, rather than
fairly deep inside the swap logic. 

It's not a big deal, I suspect.

Anyway, there's a real pre7 out there now, and it doesn't change a lot of
the issues discussed here. I wanted to get something stable and working. I
still need to get the recursive semaphore thing (or other approach) done,
but basically I think we're at 2.2.0 already apart from that issue, and
that we can continue this discussion as a "occasional tweaks" thing. 

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 17:48                                                                 ` Stephen C. Tweedie
  1999-01-13 18:07                                                                   ` 2.2.0-pre6 ain't nice =( Kalle Andersson
  1999-01-13 19:05                                                                   ` MM deadlock [was: Re: arca-vm-8...] Alan Cox
@ 1999-01-14 10:48                                                                   ` Mike Jagdis
  2 siblings, 0 replies; 243+ messages in thread
From: Mike Jagdis @ 1999-01-14 10:48 UTC (permalink / raw
  To: Stephen C. Tweedie; +Cc: linux-kernel, linux-mm

On Wed, 13 Jan 1999, Stephen C. Tweedie wrote:

> On Tue, 12 Jan 1999 20:05:21 +0100 (CET), Andrea Arcangeli
> <andrea@e-mind.com> said:
> 
> > Maybe because nobody care about shm? I think shm can wait for 2.3 to be
> > improved.
> 
> "Nobody"?  Oracle uses large shared memory regions for starters.

Yeah, and so does Informix Dynamic Server. But, in general, you
do not want this to be swapped heavily, if at all.

				Mike

-- 
    A train stops at a train station, a bus stops at a bus station.
    On my desk I have a work station...
.----------------------------------------------------------------------.
|  Mike Jagdis                  |  Internet:  mailto:mike@roan.co.uk   |
|  Roan Technology Ltd.         |                                      |
|  54A Peach Street, Wokingham  |  Telephone:  +44 118 989 0403        |
|  RG40 1XG, ENGLAND            |  Fax:        +44 118 989 1195        |
`----------------------------------------------------------------------'

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 19:26                                                                     ` Andrea Arcangeli
@ 1999-01-14 11:02                                                                       ` Mike Jagdis
  1999-01-14 22:38                                                                         ` Andrea Arcangeli
  1999-01-15  7:40                                                                       ` Agus Budy Wuysang
  1 sibling, 1 reply; 243+ messages in thread
From: Mike Jagdis @ 1999-01-14 11:02 UTC (permalink / raw
  To: Andrea Arcangeli; +Cc: Alan Cox, Stephen C. Tweedie, linux-kernel, linux-mm

On Wed, 13 Jan 1999, Andrea Arcangeli wrote:

> I wasn't aware of that. I noticed that also postgres (a big database) uses
> shm but it's _only_ something like 1 Mbyte (at least during trivial
> usage). With my current code such 1 Mbyte would not be touched unless
> there would be some really big memory squeeze (like something that causes
> a swapout of tons of memory and that would slowdown a bit the system
> anyway). This is obviously not true in clean pre7 (see
> try_to_free_pages()). 
> 
> With my latest code, optimizing the shm swapout (as we do with normal
> userspace memory) would help only if the shm memory is going to be in size
> something like the total VM allocated in all processes' mm.  Since I
> supposed that "normal" apps don't use huge amount of shm memory I told
> that we could not care until 2.3. I can't know how much shm memory uses
> Oracle SQL server because I can't have it. I am pretty sure instead that
> postgresql will be not stuck in shm swapout here even if the shm swapout
> code is gross.

Informix Dynamic Server is a free download from www.informix.com
(but its fairly big, ~30MB). I think Oracle is downloadable too.
With either the shm used is normally "large" with respect to
available physical memory (i.e. everything that isn't needed by
processes under your normal load) because the database engines
use it to cache data (this is why they want raw IO - there's no
point the OS caching the data as well).

				Mike

-- 
    A train stops at a train station, a bus stops at a bus station.
    On my desk I have a work station...
.----------------------------------------------------------------------.
|  Mike Jagdis                  |  Internet:  mailto:mike@roan.co.uk   |
|  Roan Technology Ltd.         |                                      |
|  54A Peach Street, Wokingham  |  Telephone:  +44 118 989 0403        |
|  RG40 1XG, ENGLAND            |  Fax:        +44 118 989 1195        |
`----------------------------------------------------------------------'

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] arca-vm-19 [Re: Results: Zlatko's new vm patch]
  1999-01-13 20:47                                                             ` [patch] arca-vm-19 [Re: Results: Zlatko's new vm patch] Andrea Arcangeli
@ 1999-01-14 12:30                                                               ` Andrea Arcangeli
  1999-01-15 23:56                                                                 ` [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-14 12:30 UTC (permalink / raw
  To: Steve Bergman, dlux, Nicholas J. Leon
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie, Heinz Mauelshagen

On Wed, 13 Jan 1999, Andrea Arcangeli wrote:

> I produced a new arca-vm-19. I would like if you could try it. I don't

It seems that the best algorithm I have been able to invent is the
growing_swap_cache one (the one in arca-vm-16). Steve, could you try this
new patch (arca-vm-20) against real 2.2.0-pre7? I think that it should be
still better than arca-vm-16 + SWAP_CLUSTER_MAX=512. 

If it is not very good, could you do:

echo 8 2 4 512 512 512 > /proc/sys/vm/pager

and try again? (such numbers should be the same of setting
SWAP_CLUSTER_MAX in arca-vm-16, but as default only the max_async_pages is
set to 512 because I think it's been the only one that made a difference). 

If this is again not the best, could you apply the filemap.c patch I
sent you in the last email (the one that goes back to making the shrink_mmap()
weight increase exponentially as a function of priority) and try again?

Many thanks!

Andrea Arcangeli

Here arca-vm-20 against 2.2.0-pre7:

Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.9 linux/mm/filemap.c:1.1.1.1.2.48
--- linux/mm/filemap.c:1.1.1.9	Thu Jan  7 12:21:35 1999
+++ linux/mm/filemap.c	Thu Jan 14 13:15:32 1999
@@ -121,14 +125,11 @@
 int shrink_mmap(int priority, int gfp_mask)
 {
 	static unsigned long clock = 0;
-	unsigned long limit = num_physpages;
 	struct page * page;
-	int count;
-
-	count = (limit << 1) >> priority;
+	unsigned long count = num_physpages / (priority+1);
 
 	page = mem_map + clock;
-	do {
+	while (count-- != 0) {
 		int referenced;
 
 		/* This works even in the presence of PageSkip because
@@ -147,7 +148,6 @@
 			clock = page->map_nr;
 		}
 		
-		count--;
 		referenced = test_and_clear_bit(PG_referenced, &page->flags);
 
 		if (PageLocked(page))
@@ -160,21 +160,6 @@
 		if (atomic_read(&page->count) != 1)
 			continue;
 
-		/*
-		 * Is it a page swap page? If so, we want to
-		 * drop it if it is no longer used, even if it
-		 * were to be marked referenced..
-		 */
-		if (PageSwapCache(page)) {
-			if (referenced && swap_count(page->offset) != 1)
-				continue;
-			delete_from_swap_cache(page);
-			return 1;
-		}	
-
-		if (referenced)
-			continue;
-
 		/* Is it a buffer page? */
 		if (page->buffers) {
 			if (buffer_under_min())
@@ -184,6 +169,14 @@
 			return 1;
 		}
 
+		if (referenced)
+			continue;
+
+		if (PageSwapCache(page)) {
+			delete_from_swap_cache(page);
+			return 1;
+		}	
+
 		/* is it a page-cache page? */
 		if (page->inode) {
 			if (pgcache_under_min())
@@ -191,8 +184,7 @@
 			remove_inode_page(page);
 			return 1;
 		}
-
-	} while (count > 0);
+	}
 	return 0;
 }
 
Index: linux/mm/mmap.c
diff -u linux/mm/mmap.c:1.1.1.2 linux/mm/mmap.c:1.1.1.1.2.12
--- linux/mm/mmap.c:1.1.1.2	Fri Nov 27 11:19:10 1998
+++ linux/mm/mmap.c	Wed Jan 13 21:23:38 1999
@@ -66,7 +66,7 @@
 	free += page_cache_size;
 	free += nr_free_pages;
 	free += nr_swap_pages;
-	free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100; 
+	free -= (pager_daemon.cache_min_percent + pager_daemon.buffer_min_percent + 2)*num_physpages/100; 
 	return free > pages;
 }
 
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.9 linux/mm/page_alloc.c:1.1.1.1.2.31
--- linux/mm/page_alloc.c:1.1.1.9	Thu Jan 14 12:32:57 1999
+++ linux/mm/page_alloc.c	Thu Jan 14 12:42:59 1999
@@ -124,7 +124,6 @@
 	if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
 		if (PageSwapCache(page))
 			panic ("Freeing swap cache page");
-		page->flags &= ~(1 << PG_referenced);
 		free_pages_ok(page->map_nr, 0);
 		return;
 	}
@@ -141,7 +140,6 @@
 		if (atomic_dec_and_test(&map->count)) {
 			if (PageSwapCache(map))
 				panic ("Freeing swap cache pages");
-			map->flags &= ~(1 << PG_referenced);
 			free_pages_ok(map_nr, order);
 			return;
 		}
@@ -212,19 +210,18 @@
 		 * further thought.
 		 */
 		if (!(current->flags & PF_MEMALLOC)) {
-			static int trashing = 0;
 			int freed;
 
 			if (nr_free_pages > freepages.min) {
-				if (!trashing)
+				if (!current->trashing)
 					goto ok_to_allocate;
 				if (nr_free_pages > freepages.low) {
-					trashing = 0;
+					current->trashing = 0;
 					goto ok_to_allocate;
 				}
 			}
 
-			trashing = 1;
+			current->trashing = 1;
 			current->flags |= PF_MEMALLOC;
 			freed = try_to_free_pages(gfp_mask);
 			current->flags &= ~PF_MEMALLOC;
@@ -361,7 +358,7 @@
 		if (offset >= swapdev->max)
 			break;
 		/* Don't block on I/O for read-ahead */
-		if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
+		if (atomic_read(&nr_async_pages) >= pager_daemon.max_async_pages)
 			break;
 		/* Don't read in bad or busy pages */
 		if (!swapdev->swap_map[offset])
Index: linux/mm/page_io.c
diff -u linux/mm/page_io.c:1.1.1.4 linux/mm/page_io.c:1.1.1.1.2.6
--- linux/mm/page_io.c:1.1.1.4	Tue Dec 29 01:39:20 1998
+++ linux/mm/page_io.c	Wed Jan 13 00:00:04 1999
@@ -58,7 +58,7 @@
 	}
 
 	/* Don't allow too many pending pages in flight.. */
-	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
+	if (atomic_read(&nr_async_pages) > pager_daemon.max_async_pages)
 		wait = 1;
 
 	p = &swap_info[type];
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.6 linux/mm/swap.c:1.1.1.1.2.14
--- linux/mm/swap.c:1.1.1.6	Mon Jan 11 22:24:24 1999
+++ linux/mm/swap.c	Thu Jan 14 13:15:32 1999
@@ -40,41 +40,17 @@
 };
 
 /* How many pages do we try to swap or page in/out together? */
-int page_cluster = 4; /* Default value modified in swap_setup() */
+int page_cluster = 5; /* Default readahead 32 pages every time */
 
 /* We track the number of pages currently being asynchronously swapped
    out, so that we don't try to swap TOO many pages out at once */
 atomic_t nr_async_pages = ATOMIC_INIT(0);
 
-buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
-	10,	/* borrow percent buffer */
-	60	/* maximum percent buffer */
-};
-
-buffer_mem_t page_cache = {
-	2,	/* minimum percent page cache */
-	15,	/* borrow percent page cache */
-	75	/* maximum */
-};
-
 pager_daemon_t pager_daemon = {
-	512,	/* base number for calculating the number of tries */
-	SWAP_CLUSTER_MAX,	/* minimum number of tries */
-	SWAP_CLUSTER_MAX,	/* do swap I/O in clusters of this size */
+	8,	/* starting priority of try_to_free_pages() */
+	2,	/* minimum percent buffer */
+	4,	/* minimum percent page cache */
+	32,	/* number of tries we do on every try_to_free_pages() */
+	128,	/* do swap I/O in clusters of this size */
+	512	/* max number of async swapped-out pages on the fly */
 };
-
-/*
- * Perform any setup for the swap system
- */
-
-void __init swap_setup(void)
-{
-	/* Use a smaller cluster for memory <16MB or <32MB */
-	if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT))
-		page_cluster = 2;
-	else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))
-		page_cluster = 3;
-	else
-		page_cluster = 4;
-}
Index: linux/mm/swapfile.c
diff -u linux/mm/swapfile.c:1.1.1.3 linux/mm/swapfile.c:1.1.1.1.2.6
--- linux/mm/swapfile.c:1.1.1.3	Mon Jan 11 22:24:24 1999
+++ linux/mm/swapfile.c	Wed Jan 13 00:00:04 1999
@@ -23,7 +23,6 @@
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-#define SWAPFILE_CLUSTER 256
 
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
@@ -31,7 +30,7 @@
 	/* 
 	 * We try to cluster swap pages by allocating them
 	 * sequentially in swap.  Once we've allocated
-	 * SWAPFILE_CLUSTER pages this way, however, we resort to
+	 * SWAP_CLUSTER pages this way, however, we resort to
 	 * first-free allocation, starting a new cluster.  This
 	 * prevents us from scattering swap pages all over the entire
 	 * swap partition, so that we reduce overall disk seek times
@@ -47,7 +46,7 @@
 			goto got_page;
 		}
 	}
-	si->cluster_nr = SWAPFILE_CLUSTER;
+	si->cluster_nr = SWAP_CLUSTER;
 	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
 		if (si->swap_map[offset])
 			continue;
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.12 linux/mm/vmscan.c:1.1.1.1.2.93
--- linux/mm/vmscan.c:1.1.1.12	Mon Jan 11 22:24:24 1999
+++ linux/mm/vmscan.c	Thu Jan 14 13:15:32 1999
@@ -10,6 +10,11 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache() and always async swapout original idea.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,6 +25,8 @@
 
 #include <asm/pgtable.h>
 
+int swapout_interval = HZ;
+
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -71,6 +78,21 @@
 	 * memory, and we should just continue our scan.
 	 */
 	if (PageSwapCache(page_map)) {
+		if (pte_write(pte))
+		{
+			struct page *found;
+			printk ("VM: Found a writable swap-cached page!\n");
+			/* Try to diagnose the problem ... */
+			found = find_page(&swapper_inode, page_map->offset);
+			if (found) {
+				printk("page=%p@%08lx, found=%p, count=%d\n",
+				       page_map, page_map->offset,
+				       found, atomic_read(&found->count));
+				__free_page(found);
+			} else 
+				printk ("Spurious, page not in cache\n");
+			return 0;
+		}
 		entry = page_map->offset;
 		swap_duplicate(entry);
 		set_pte(page_table, __pte(entry));
@@ -199,7 +221,7 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
+		tsk->mm->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
@@ -271,7 +293,7 @@
 	/*
 	 * Go through process' page directory.
 	 */
-	address = p->swap_address;
+	address = p->mm->swap_address;
 
 	/*
 	 * Find the proper vm-area
@@ -293,8 +315,8 @@
 	}
 
 	/* We didn't find anything for the process */
-	p->swap_cnt = 0;
-	p->swap_address = 0;
+	p->mm->swap_cnt = 0;
+	p->mm->swap_address = 0;
 	return 0;
 }
 
@@ -306,7 +328,8 @@
 static int swap_out(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int counter, assign;
+	unsigned long max_cnt;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -325,7 +348,7 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
+	else if (counter > nr_tasks)
 		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
@@ -338,13 +361,13 @@
 		for (; p != &init_task; p = p->next_task) {
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
-				p->swap_cnt = p->mm->rss;
-			if (p->swap_cnt > max_cnt) {
-				max_cnt = p->swap_cnt;
+				p->mm->swap_cnt = p->mm->rss;
+			if (p->mm->swap_cnt > max_cnt) {
+				max_cnt = p->mm->swap_cnt;
 				pbest = p;
 			}
 		}
@@ -375,8 +398,6 @@
        int i;
        char *revision="$Revision: 1.5 $", *s, *e;
 
-       swap_setup();
-       
        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
                s++, i = e - s;
@@ -430,7 +451,7 @@
 			break;
 		current->state = TASK_INTERRUPTIBLE;
 		run_task_queue(&tq_disk);
-		schedule_timeout(HZ);
+		schedule_timeout(swapout_interval);
 
 		/*
 		 * kswapd isn't even meant to keep up with anything,
@@ -438,13 +459,36 @@
 		 * point is to make sure that the system doesn't stay
 		 * forever in a really bad memory squeeze.
 		 */
-		if (nr_free_pages < freepages.high)
+		if (nr_free_pages < freepages.min)
 			try_to_free_pages(GFP_KSWAPD);
 	}
 
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	static unsigned long grow_swap_cache = 0;
+
+	if (!shrink_mmap(priority, gfp_mask))
+		grow_swap_cache = 1;
+
+	switch (grow_swap_cache)
+	{
+	case 0:
+		return 1;
+	default:
+		if (grow_swap_cache++ >= freepages.high)
+			grow_swap_cache = 0;
+	}
+
+	if (swap_out(priority, gfp_mask))
+		return 1;
+
+	grow_swap_cache = 0;
+	return 0;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,34 +501,35 @@
 int try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
-	int count = SWAP_CLUSTER_MAX;
+	static int state = 0;
+	int count = pager_daemon.tries;
 
 	lock_kernel();
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
-
-	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
 
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	priority = pager_daemon.priority;
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.6 linux/kernel/fork.c:1.1.1.1.2.10
--- linux/kernel/fork.c:1.1.1.6	Mon Jan 11 22:24:21 1999
+++ linux/kernel/fork.c	Mon Jan 11 22:56:09 1999
@@ -511,6 +514,7 @@
 
 	p->did_exec = 0;
 	p->swappable = 0;
+	p->trashing = 0;
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
Index: linux/kernel/sysctl.c
diff -u linux/kernel/sysctl.c:1.1.1.6 linux/kernel/sysctl.c:1.1.1.1.2.12
--- linux/kernel/sysctl.c:1.1.1.6	Mon Jan 11 22:24:22 1999
+++ linux/kernel/sysctl.c	Wed Jan 13 21:23:38 1999
@@ -32,7 +32,7 @@
 
 /* External variables not in a header file. */
 extern int panic_timeout;
-extern int console_loglevel, C_A_D;
+extern int console_loglevel, C_A_D, swapout_interval;
 extern int bdf_prm[], bdflush_min[], bdflush_max[];
 extern char binfmt_java_interpreter[], binfmt_java_appletviewer[];
 extern int sysctl_overcommit_memory;
@@ -216,6 +216,8 @@
 };
 
 static ctl_table vm_table[] = {
+	{VM_SWAPOUT, "swapout_interval",
+	 &swapout_interval, sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_FREEPG, "freepages", 
 	 &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec},
 	{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0600, NULL,
@@ -223,11 +225,7 @@
 	 &bdflush_min, &bdflush_max},
 	{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
 	 sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
-	{VM_BUFFERMEM, "buffermem",
-	 &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGECACHE, "pagecache",
-	 &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGERDAEMON, "kswapd",
+	{VM_PAGERDAEMON, "pager",
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
 	{VM_PGT_CACHE, "pagetable_cache", 
 	 &pgt_cache_water, 2*sizeof(int), 0600, NULL, &proc_dointvec},
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.6 linux/include/linux/mm.h:1.1.1.1.2.21
--- linux/include/linux/mm.h:1.1.1.6	Mon Jan 11 22:23:57 1999
+++ linux/include/linux/mm.h	Thu Jan 14 13:15:31 1999
@@ -118,7 +118,6 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
@@ -302,8 +301,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
@@ -387,9 +385,9 @@
 }
 
 #define buffer_under_min()	((buffermem >> PAGE_SHIFT) * 100 < \
-				buffer_mem.min_percent * num_physpages)
-#define pgcache_under_min()	(page_cache_size * 100 < \
-				page_cache.min_percent * num_physpages)
+				pager_daemon.buffer_min_percent * num_physpages)
+#define pgcache_under_min()	((page_cache_size-swapper_inode.i_nrpages) * 100 < \
+				pager_daemon.cache_min_percent * num_physpages)
 
 #endif /* __KERNEL__ */
 
Index: linux/include/linux/sched.h
diff -u linux/include/linux/sched.h:1.1.1.6 linux/include/linux/sched.h:1.1.1.1.2.13
--- linux/include/linux/sched.h:1.1.1.6	Mon Jan 11 22:24:03 1999
+++ linux/include/linux/sched.h	Thu Jan 14 12:42:58 1999
@@ -169,6 +174,7 @@
 	unsigned long rss, total_vm, locked_vm;
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
+	unsigned long swap_cnt, swap_address;
 	/*
 	 * This is an architecture-specific pointer: the portable
 	 * part of Linux does not know about any segments.
@@ -177,15 +183,17 @@
 };
 
 #define INIT_MM {					\
-		&init_mmap, NULL, swapper_pg_dir, 	\
+		&init_mmap, NULL, swapper_pg_dir,	\
 		ATOMIC_INIT(1), 1,			\
 		MUTEX,					\
 		0,					\
 		0, 0, 0, 0,				\
-		0, 0, 0, 				\
+		0, 0, 0,				\
 		0, 0, 0, 0,				\
 		0, 0, 0,				\
-		0, 0, NULL }
+		0, 0,					\
+		0, 0,					\
+		NULL }
 
 struct signal_struct {
 	atomic_t		count;
@@ -270,8 +278,7 @@
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
-	unsigned long swap_address;
-	unsigned long swap_cnt;		/* number of pages to swap on next pass */
+	int trashing:1;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
@@ -355,7 +362,7 @@
 /* utime */	{0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */	0,0,0,0,0,0, \
-/* swp */	0,0,0, \
+/* swp */	0,0, \
 /* process credentials */					\
 /* uid etc */	0,0,0,0,0,0,0,0,				\
 /* suppl grps*/ 0, {0,},					\
Index: linux/include/linux/swap.h
diff -u linux/include/linux/swap.h:1.1.1.6 linux/include/linux/swap.h:1.1.1.1.2.17
--- linux/include/linux/swap.h:1.1.1.6	Mon Jan 11 22:24:05 1999
+++ linux/include/linux/swap.h	Wed Jan 13 21:28:52 1999
@@ -33,7 +33,7 @@
 #define SWP_USED	1
 #define SWP_WRITEOK	3
 
-#define SWAP_CLUSTER_MAX 32
+#define SWAP_CLUSTER	(pager_daemon.swap_cluster)
 
 #define SWAP_MAP_MAX	0x7fff
 #define SWAP_MAP_BAD	0x8000
@@ -68,9 +68,6 @@
 
 /* linux/ipc/shm.c */
 extern int shm_swap (int, int);
-
-/* linux/mm/swap.c */
-extern void swap_setup (void);
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(unsigned int gfp_mask);
Index: linux/include/linux/swapctl.h
diff -u linux/include/linux/swapctl.h:1.1.1.4 linux/include/linux/swapctl.h:1.1.1.1.2.6
--- linux/include/linux/swapctl.h:1.1.1.4	Mon Jan 11 22:24:05 1999
+++ linux/include/linux/swapctl.h	Thu Jan 14 13:15:31 1999
@@ -4,32 +4,23 @@
 #include <asm/page.h>
 #include <linux/fs.h>
 
-typedef struct buffer_mem_v1
+typedef struct freepages_s
 {
-	unsigned int	min_percent;
-	unsigned int	borrow_percent;
-	unsigned int	max_percent;
-} buffer_mem_v1;
-typedef buffer_mem_v1 buffer_mem_t;
-extern buffer_mem_t buffer_mem;
-extern buffer_mem_t page_cache;
-
-typedef struct freepages_v1
-{
 	unsigned int	min;
 	unsigned int	low;
 	unsigned int	high;
-} freepages_v1;
-typedef freepages_v1 freepages_t;
+} freepages_t;
 extern freepages_t freepages;
 
-typedef struct pager_daemon_v1
+typedef struct pager_daemon_s
 {
-	unsigned int	tries_base;
-	unsigned int	tries_min;
+	unsigned int	priority;
+	unsigned int	buffer_min_percent;
+	unsigned int	cache_min_percent;
+	unsigned int	tries;
 	unsigned int	swap_cluster;
-} pager_daemon_v1;
-typedef pager_daemon_v1 pager_daemon_t;
+	unsigned int	max_async_pages;
+} pager_daemon_t;
 extern pager_daemon_t pager_daemon;
 
 #endif /* _LINUX_SWAPCTL_H */


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 22:14                                                                       ` Stephen C. Tweedie
@ 1999-01-14 14:53                                                                         ` Dr. Werner Fink
  1999-01-21 16:50                                                                           ` Stephen C. Tweedie
  1999-01-22 16:29                                                                           ` Eric W. Biederman
  0 siblings, 2 replies; 243+ messages in thread
From: Dr. Werner Fink @ 1999-01-14 14:53 UTC (permalink / raw
  To: Stephen C. Tweedie, Andrea Arcangeli
  Cc: Rik van Riel, Zlatko Calusic, Linus Torvalds, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

> > For the case of binaries the aging on the page cache should take care of
> > it (even if there's no aging on the swap cache as pre[567] if I remember
> > well). 
> 
> There is no aging on the page cache at all other than the PG_referenced
> bit.

I know that most of you do not like aging.  Nevertheless, on high stressed
systems with less than 128M you will see a critical point whereas the page
cache and readahead does not avoid that swapin I/O time needed by a program
increases to similar size of the average program time slice.

At this point the system performance breaks down dramatically even
with 2.2.0pre[567] ...

What's about a simple aging of program page cluster or better of the
page cache?  Increasing the age could be done if and only if the pages
or page clusters swapped in and the program wasn't able to use its
time slice. Decreasing the age could be placed in shrink_mmap().


        Werner

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-14 11:02                                                                       ` Mike Jagdis
@ 1999-01-14 22:38                                                                         ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-14 22:38 UTC (permalink / raw
  To: Mike Jagdis
  Cc: Alan Cox, Stephen C. Tweedie, linux-kernel, linux-mm,
	Alessandro Suardi

On Thu, 14 Jan 1999, Mike Jagdis wrote:

> With either the shm used is normally "large" with respect to
> available physical memory (i.e. everything that isn't needed by
> processes under your normal load) because the database engines
> use it to cache data (this is why they want raw IO - there's no
> point the OS caching the data as well).

Ah but if what you say is true, the db server shm issue raised by Stephen
is completely pointless. If the shm memory is used as _cache_ for the data
there's _no_ point in swapping it out in the first place. So when using the
shm for caching purposes the db server _must_ set the SHM_LOCK flag on the
shm memory using shmctl.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-13 19:26                                                                     ` Andrea Arcangeli
  1999-01-14 11:02                                                                       ` Mike Jagdis
@ 1999-01-15  7:40                                                                       ` Agus Budy Wuysang
  1 sibling, 0 replies; 243+ messages in thread
From: Agus Budy Wuysang @ 1999-01-15  7:40 UTC (permalink / raw
  To: Andrea Arcangeli; +Cc: Alan Cox, Stephen C. Tweedie, linux-kernel, linux-mm

Andrea Arcangeli wrote:
> 
> On Wed, 13 Jan 1999, Alan Cox wrote:
> 
> > > >> Could somebody spare a minute to explain why is that so, and what
> > > >> needs to be done to make SHM swapping asynchronous?
> > >
> > > > Maybe because nobody care about shm? I think shm can wait for 2.3 to be
> > > > improved.
> > >
> > > "Nobody"?  Oracle uses large shared memory regions for starters.
> >
> > All the big databases use large shared memory objects.
> 
> I wasn't aware of that. I noticed that also postgres (a big database) uses
> shm but it's _only_ something like 1 Mbyte (at least during trivial
> usage). With my current code such 1 Mbyte would not be touched unless

Our current database size is 1.6Gb; locks & buffers are
using "large" SHM heavily. Due to Linux SHMMAX = 16M,
allocation is divided into several 16M segments.

current database buffers = 130,000 * 1024 bytes
locks = 65536 * 18 bytes

Quick Spec:
Dual PPro 200MHz, 256Mb Ram
Kernel 2.2.0-pre6
Progress 7.3C08 via latest iBCS 2.1.x emulator

-- 
+---| Netscape Communicator 4.x |---| Powered by Linux 2.1.x |---+
|/v\ Agus Budy Wuysang                   MIS Department          |
| |  Phone:  +62-21-344-1316 ext 317     GSM: +62-816-1972-051   |
+--------| http://www.rad.net.id/users/personal/s/supes |--------+
-----BEGIN GEEK CODE BLOCK-----
Version: 3.1
GCS/IT dx s: a- C+++ UL++++$ P- L+++(++++) E--- W++ N+++ o? K? w-- O-
M- V-- PS+ PE Y-- PGP t+@ 5 X+ R- tv- b+ DI? D++(+) G e++ h* r+ y++
------END GEEK CODE BLOCK------
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-14 12:30                                                               ` Andrea Arcangeli
@ 1999-01-15 23:56                                                                 ` Andrea Arcangeli
  1999-01-16 16:49                                                                   ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-15 23:56 UTC (permalink / raw
  To: Steve Bergman, dlux, Nicholas J. Leon, Eric W. Biederman
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie, Heinz Mauelshagen, Max

I did not understand what PG_dirty meant until this afternoon, when I thought
"why not move the rw_swap_page to shrink_mmap" and leave swap_out() only
to allocate the in-order-swap-entry and its swap cache page? So I crudely
moved the rw_swap_cache, obviously with tons of races, and it worked
fairly well ;).

Then I saw the PG_dirty comment of Linus in vmscan.c, and if I have
understood it well, I had to use PG_dirty to realize what I was going to
do.

So I hacked heavily all the afternoon and the evening and now it seems to
work fine ;)). Really, the free_user_and_cache() algorithm that I am using
in this patch is "new"; I have never seen how it compares with the usual
swapout code in swap_out(), so I can't make a raw comparison.

What I can say is that the global performance seems improved a lot (also
the OOM handling seems improved, try and you'll see). But the raw swapout
performance has been reduced (from 51 sec of arca-vm-19 to 61 sec).
Seems very good here though.

The patch also merges other stuff of mine like my update_shared_mapping(),
which is safe right now (at least as much as vmtruncate ;).  It also fixes
the for_each_mm issue. I can cut out the garbage if somebody needs...

Ah, and the patch also removed the map_nr field since x86 should perform
equally well (and the removal saves some bits of memory). This was an
idea from Max.

Don't use the patch without first doing a backup though, since mm
corruption could happen, since I could have made a mistake.

I would be interested if somebody could make comparison with arca-vm-19 or
pre[57] for example... but don't waste time doing many benchmarks if it
seems a lose under every side.

Probably in low memory (<=8m) this my new arca-vm-21 needs a:

echo 6 1 4 32 128 512 >/proc/sys/vm/pager

Thanks.

Here arca-vm-21 against 2.2.0-pre7:

Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.9 linux/mm/filemap.c:1.1.1.1.2.52
--- linux/mm/filemap.c:1.1.1.9	Thu Jan  7 12:21:35 1999
+++ linux/mm/filemap.c	Sat Jan 16 00:17:39 1999
@@ -5,6 +5,11 @@
  */
 
 /*
+ * update_shared_mappings(), Copyright (C) 1998  Andrea Arcangeli
+ * PG_dirty shrink_mmap swapout, Copyright (C) 1999  Andrea Arcangeli
+ */
+
+/*
  * This file handles the generic file mmap semantics used by
  * most "normal" filesystems (but you don't /have/ to use this:
  * the NFS filesystem used to do this differently, for example)
@@ -121,14 +126,11 @@
 int shrink_mmap(int priority, int gfp_mask)
 {
 	static unsigned long clock = 0;
-	unsigned long limit = num_physpages;
 	struct page * page;
-	int count;
-
-	count = (limit << 1) >> priority;
+	unsigned long count = (num_physpages << 1) >> priority;
 
 	page = mem_map + clock;
-	do {
+	while (count-- != 0) {
 		int referenced;
 
 		/* This works even in the presence of PageSkip because
@@ -144,10 +146,9 @@
 		if (PageSkip(page)) {
 			/* next_hash is overloaded for PageSkip */
 			page = page->next_hash;
-			clock = page->map_nr;
+			clock = page - mem_map;
 		}
 		
-		count--;
 		referenced = test_and_clear_bit(PG_referenced, &page->flags);
 
 		if (PageLocked(page))
@@ -160,21 +161,6 @@
 		if (atomic_read(&page->count) != 1)
 			continue;
 
-		/*
-		 * Is it a page swap page? If so, we want to
-		 * drop it if it is no longer used, even if it
-		 * were to be marked referenced..
-		 */
-		if (PageSwapCache(page)) {
-			if (referenced && swap_count(page->offset) != 1)
-				continue;
-			delete_from_swap_cache(page);
-			return 1;
-		}	
-
-		if (referenced)
-			continue;
-
 		/* Is it a buffer page? */
 		if (page->buffers) {
 			if (buffer_under_min())
@@ -184,6 +170,26 @@
 			return 1;
 		}
 
+		if (referenced)
+			continue;
+
+		if (PageSwapCache(page)) {
+			unsigned long entry = page->offset;
+			if (PageTestandClearDirty(page) &&
+			    swap_count(entry) > 1)
+			{
+				if (!(gfp_mask & __GFP_IO))
+					continue;
+				entry = page->offset;
+				set_bit(PG_locked, &page->flags);
+				atomic_inc(&page->count);
+				rw_swap_page(WRITE, entry, page, 0);
+				atomic_dec(&page->count);
+			}
+			delete_from_swap_cache(page);
+			return 1;
+		}
+
 		/* is it a page-cache page? */
 		if (page->inode) {
 			if (pgcache_under_min())
@@ -191,8 +197,7 @@
 			remove_inode_page(page);
 			return 1;
 		}
-
-	} while (count > 0);
+	}
 	return 0;
 }
 
@@ -1165,6 +1170,74 @@
 	return mk_pte(page,vma->vm_page_prot);
 }
 
+static void update_one_shared_mapping(struct vm_area_struct *shared,
+				      unsigned long address, pte_t orig_pte)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset(shared->vm_mm, address);
+	if (pgd_none(*pgd))
+		goto out;
+	if (pgd_bad(*pgd)) {
+		printk(KERN_ERR "update_shared_mappings: bad pgd (%08lx)\n",
+		       pgd_val(*pgd));
+		pgd_clear(pgd);
+		goto out;
+	}
+
+	pmd = pmd_offset(pgd, address);
+	if (pmd_none(*pmd))
+		goto out;
+	if (pmd_bad(*pmd))
+	{
+		printk(KERN_ERR "update_shared_mappings: bad pmd (%08lx)\n",
+		       pmd_val(*pmd));
+		pmd_clear(pmd);
+		goto out;
+	}
+
+	pte = pte_offset(pmd, address);
+
+	if (pte_val(pte_mkclean(pte_mkyoung(*pte))) !=
+	    pte_val(pte_mkclean(pte_mkyoung(orig_pte))))
+		goto out;
+
+	flush_page_to_ram(page(pte));
+	flush_cache_page(shared, address);
+	set_pte(pte, pte_mkclean(*pte));
+	flush_tlb_page(shared, address);
+
+ out:
+}
+
+static void update_shared_mappings(struct vm_area_struct *this,
+				   unsigned long address,
+				   pte_t orig_pte)
+{
+	if (this->vm_flags & VM_SHARED)
+	{
+		struct file * filp = this->vm_file;
+		if (filp)
+		{
+			struct inode * inode = filp->f_dentry->d_inode;
+			struct semaphore * s = &inode->i_sem;
+			struct vm_area_struct * shared;
+
+			down(s);
+			for (shared = inode->i_mmap; shared;
+			     shared = shared->vm_next_share)
+			{
+				if (shared->vm_mm == this->vm_mm)
+					continue;
+				update_one_shared_mapping(shared, address,
+							  orig_pte);
+			}
+			up(s);
+		}
+	}
+}
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 	unsigned long address, unsigned int flags)
@@ -1184,6 +1257,7 @@
 		flush_tlb_page(vma, address);
 		page = pte_page(pte);
 		atomic_inc(&mem_map[MAP_NR(page)].count);
+		update_shared_mappings(vma, address, pte);
 	} else {
 		if (pte_none(pte))
 			return 0;
Index: linux/mm/mmap.c
diff -u linux/mm/mmap.c:1.1.1.2 linux/mm/mmap.c:1.1.1.1.2.12
--- linux/mm/mmap.c:1.1.1.2	Fri Nov 27 11:19:10 1998
+++ linux/mm/mmap.c	Wed Jan 13 21:23:38 1999
@@ -66,7 +66,7 @@
 	free += page_cache_size;
 	free += nr_free_pages;
 	free += nr_swap_pages;
-	free -= (page_cache.min_percent + buffer_mem.min_percent + 2)*num_physpages/100; 
+	free -= (pager_daemon.cache_min_percent + pager_daemon.buffer_min_percent + 2)*num_physpages/100; 
 	return free > pages;
 }
 
@@ -76,11 +76,16 @@
 	struct file * file = vma->vm_file;
 
 	if (file) {
+		struct semaphore * sem = &file->f_dentry->d_inode->i_sem;
+		struct file * file = vma->vm_file;
+
+		down(sem);
 		if (vma->vm_flags & VM_DENYWRITE)
 			file->f_dentry->d_inode->i_writecount++;
 		if(vma->vm_next_share)
 			vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share;
 		*vma->vm_pprev_share = vma->vm_next_share;
+		up(sem);
 	}
 }
 
@@ -514,9 +519,6 @@
 		free = free->vm_next;
 		freed = 1;
 
-		mm->map_count--;
-		remove_shared_vm_struct(mpnt);
-
 		st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
 		end = addr+len;
 		end = end > mpnt->vm_end ? mpnt->vm_end : end;
@@ -525,6 +527,9 @@
 		if (mpnt->vm_ops && mpnt->vm_ops->unmap)
 			mpnt->vm_ops->unmap(mpnt, st, size);
 
+		mm->map_count--;
+		remove_shared_vm_struct(mpnt);
+
 		flush_cache_range(mm, st, end);
 		zap_page_range(mm, st, size);
 		flush_tlb_range(mm, st, end);
@@ -616,14 +621,18 @@
 	file = vmp->vm_file;
 	if (file) {
 		struct inode * inode = file->f_dentry->d_inode;
+		struct semaphore * sem = &inode->i_sem;
+
 		if (vmp->vm_flags & VM_DENYWRITE)
 			inode->i_writecount--;
       
+		down(sem);
 		/* insert vmp into inode's share list */
 		if((vmp->vm_next_share = inode->i_mmap) != NULL)
 			inode->i_mmap->vm_pprev_share = &vmp->vm_next_share;
 		inode->i_mmap = vmp;
 		vmp->vm_pprev_share = &inode->i_mmap;
+		up(sem);
 	}
 }
 
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.9 linux/mm/page_alloc.c:1.1.1.1.2.32
--- linux/mm/page_alloc.c:1.1.1.9	Thu Jan 14 12:32:57 1999
+++ linux/mm/page_alloc.c	Fri Jan 15 21:48:00 1999
@@ -124,8 +124,7 @@
 	if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
 		if (PageSwapCache(page))
 			panic ("Freeing swap cache page");
-		page->flags &= ~(1 << PG_referenced);
-		free_pages_ok(page->map_nr, 0);
+		free_pages_ok(page - mem_map, 0);
 		return;
 	}
 }
@@ -141,7 +140,6 @@
 		if (atomic_dec_and_test(&map->count)) {
 			if (PageSwapCache(map))
 				panic ("Freeing swap cache pages");
-			map->flags &= ~(1 << PG_referenced);
 			free_pages_ok(map_nr, order);
 			return;
 		}
@@ -163,7 +161,7 @@
 			if (!dma || CAN_DMA(ret)) { \
 				unsigned long map_nr; \
 				(prev->next = ret->next)->prev = prev; \
-				map_nr = ret->map_nr; \
+				map_nr = ret - mem_map; \
 				MARK_USED(map_nr, new_order, area); \
 				nr_free_pages -= 1 << order; \
 				EXPAND(ret, map_nr, order, new_order, area); \
@@ -212,19 +210,18 @@
 		 * further thought.
 		 */
 		if (!(current->flags & PF_MEMALLOC)) {
-			static int trashing = 0;
 			int freed;
 
 			if (nr_free_pages > freepages.min) {
-				if (!trashing)
+				if (!current->trashing)
 					goto ok_to_allocate;
 				if (nr_free_pages > freepages.low) {
-					trashing = 0;
+					current->trashing = 0;
 					goto ok_to_allocate;
 				}
 			}
 
-			trashing = 1;
+			current->trashing = 1;
 			current->flags |= PF_MEMALLOC;
 			freed = try_to_free_pages(gfp_mask);
 			current->flags &= ~PF_MEMALLOC;
@@ -322,7 +319,6 @@
 		--p;
 		atomic_set(&p->count, 0);
 		p->flags = (1 << PG_DMA) | (1 << PG_reserved);
-		p->map_nr = p - mem_map;
 	} while (p > mem_map);
 
 	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
@@ -361,7 +357,7 @@
 		if (offset >= swapdev->max)
 			break;
 		/* Don't block on I/O for read-ahead */
-		if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
+		if (atomic_read(&nr_async_pages) >= pager_daemon.max_async_pages)
 			break;
 		/* Don't read in bad or busy pages */
 		if (!swapdev->swap_map[offset])
Index: linux/mm/page_io.c
diff -u linux/mm/page_io.c:1.1.1.4 linux/mm/page_io.c:1.1.1.1.2.7
--- linux/mm/page_io.c:1.1.1.4	Tue Dec 29 01:39:20 1998
+++ linux/mm/page_io.c	Fri Jan 15 21:48:00 1999
@@ -58,7 +58,7 @@
 	}
 
 	/* Don't allow too many pending pages in flight.. */
-	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
+	if (atomic_read(&nr_async_pages) > pager_daemon.max_async_pages)
 		wait = 1;
 
 	p = &swap_info[type];
@@ -233,10 +233,8 @@
 /* A simple wrapper so the base function doesn't need to enforce
  * that all swap pages go through the swap cache!
  */
-void rw_swap_page(int rw, unsigned long entry, char *buf, int wait)
+void rw_swap_page(int rw, unsigned long entry, struct page *page, int wait)
 {
-	struct page *page = mem_map + MAP_NR(buf);
-
 	if (page->inode && page->inode != &swapper_inode)
 		panic ("Tried to swap a non-swapper page");
 
@@ -281,7 +279,7 @@
 	page->inode = &swapper_inode;
 	page->offset = entry;
 	atomic_inc(&page->count);	/* Protect from shrink_mmap() */
-	rw_swap_page(rw, entry, buffer, 1);
+	rw_swap_page(rw, entry, page, 1);
 	atomic_dec(&page->count);
 	page->inode = 0;
 	clear_bit(PG_swap_cache, &page->flags);
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.6 linux/mm/swap.c:1.1.1.1.2.18
--- linux/mm/swap.c:1.1.1.6	Mon Jan 11 22:24:24 1999
+++ linux/mm/swap.c	Sat Jan 16 00:00:55 1999
@@ -40,28 +40,19 @@
 };
 
 /* How many pages do we try to swap or page in/out together? */
-int page_cluster = 4; /* Default value modified in swap_setup() */
+int page_cluster = 5; /* Default readahead 32 pages every time */
 
 /* We track the number of pages currently being asynchronously swapped
    out, so that we don't try to swap TOO many pages out at once */
 atomic_t nr_async_pages = ATOMIC_INIT(0);
 
-buffer_mem_t buffer_mem = {
-	2,	/* minimum percent buffer */
-	10,	/* borrow percent buffer */
-	60	/* maximum percent buffer */
-};
-
-buffer_mem_t page_cache = {
-	2,	/* minimum percent page cache */
-	15,	/* borrow percent page cache */
-	75	/* maximum */
-};
-
 pager_daemon_t pager_daemon = {
-	512,	/* base number for calculating the number of tries */
-	SWAP_CLUSTER_MAX,	/* minimum number of tries */
-	SWAP_CLUSTER_MAX,	/* do swap I/O in clusters of this size */
+	10,	/* starting priority of try_to_free_pages() */
+	1,	/* minimum percent buffer */
+	5,	/* minimum percent page cache */
+	32,	/* number of tries we do on every try_to_free_pages() */
+	128,	/* do swap I/O in clusters of this size */
+	512	/* max number of async swapped-out pages on the fly */
 };
 
 /*
@@ -75,6 +66,4 @@
 		page_cluster = 2;
 	else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))
 		page_cluster = 3;
-	else
-		page_cluster = 4;
 }
Index: linux/mm/swap_state.c
diff -u linux/mm/swap_state.c:1.1.1.6 linux/mm/swap_state.c:1.1.1.1.2.13
--- linux/mm/swap_state.c:1.1.1.6	Thu Jan 14 12:32:57 1999
+++ linux/mm/swap_state.c	Fri Jan 15 23:23:54 1999
@@ -213,6 +213,7 @@
 	       "entry %08lx)\n",
 	       page_address(page), atomic_read(&page->count), entry);
 #endif
+	PageClearDirty(page);
 	remove_from_swap_cache (page);
 	swap_free (entry);
 }
@@ -320,7 +321,7 @@
 		goto out_free_page;
 
 	set_bit(PG_locked, &new_page->flags);
-	rw_swap_page(READ, entry, (char *) new_page_addr, wait);
+	rw_swap_page(READ, entry, new_page, wait);
 #ifdef DEBUG_SWAP
 	printk("DebugVM: read_swap_cache_async created "
 	       "entry %08lx at %p\n",
Index: linux/mm/swapfile.c
diff -u linux/mm/swapfile.c:1.1.1.3 linux/mm/swapfile.c:1.1.1.1.2.6
--- linux/mm/swapfile.c:1.1.1.3	Mon Jan 11 22:24:24 1999
+++ linux/mm/swapfile.c	Wed Jan 13 00:00:04 1999
@@ -23,7 +23,6 @@
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-#define SWAPFILE_CLUSTER 256
 
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
@@ -31,7 +30,7 @@
 	/* 
 	 * We try to cluster swap pages by allocating them
 	 * sequentially in swap.  Once we've allocated
-	 * SWAPFILE_CLUSTER pages this way, however, we resort to
+	 * SWAP_CLUSTER pages this way, however, we resort to
 	 * first-free allocation, starting a new cluster.  This
 	 * prevents us from scattering swap pages all over the entire
 	 * swap partition, so that we reduce overall disk seek times
@@ -47,7 +46,7 @@
 			goto got_page;
 		}
 	}
-	si->cluster_nr = SWAPFILE_CLUSTER;
+	si->cluster_nr = SWAP_CLUSTER;
 	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
 		if (si->swap_map[offset])
 			continue;
Index: linux/mm/vmalloc.c
diff -u linux/mm/vmalloc.c:1.1.1.2 linux/mm/vmalloc.c:1.1.1.1.2.3
--- linux/mm/vmalloc.c:1.1.1.2	Fri Nov 27 11:19:11 1998
+++ linux/mm/vmalloc.c	Thu Dec 31 18:55:11 1998
@@ -10,6 +10,7 @@
 #include <asm/uaccess.h>
 
 static struct vm_struct * vmlist = NULL;
+static spinlock_t	  vmlist_spinlock;
 
 static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size)
 {
@@ -158,17 +159,21 @@
 	if (!area)
 		return NULL;
 	addr = VMALLOC_START;
+	spin_lock(&vmlist_spinlock);
 	for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
 		if (size + addr < (unsigned long) tmp->addr)
 			break;
-		if (addr > VMALLOC_END-size)
+		if (addr > VMALLOC_END-size) {
+			spin_unlock(&vmlist_spinlock);
 			return NULL;
+		}
 		addr = tmp->size + (unsigned long) tmp->addr;
 	}
 	area->addr = (void *)addr;
 	area->size = size + PAGE_SIZE;
 	area->next = *p;
 	*p = area;
+	spin_unlock(&vmlist_spinlock);
 	return area;
 }
 
@@ -182,14 +187,18 @@
 		printk("Trying to vfree() bad address (%p)\n", addr);
 		return;
 	}
+	spin_lock(&vmlist_spinlock);
 	for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
 		if (tmp->addr == addr) {
 			*p = tmp->next;
-			vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
+			spin_unlock(&vmlist_spinlock);
+			vmfree_area_pages(VMALLOC_VMADDR(tmp->addr),
+					  tmp->size - PAGE_SIZE);
 			kfree(tmp);
 			return;
 		}
 	}
+	spin_unlock(&vmlist_spinlock);
 	printk("Trying to vfree() nonexistent vm area (%p)\n", addr);
 }
 
@@ -222,6 +231,7 @@
 	if ((unsigned long) addr + count < count)
 		count = -(unsigned long) addr;
 
+	spin_lock(&vmlist_spinlock);
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
 		vaddr = (char *) tmp->addr;
 		if (addr >= vaddr + tmp->size - PAGE_SIZE)
@@ -245,5 +255,6 @@
 		} while (--n > 0);
 	}
 finished:
+	spin_unlock(&vmlist_spinlock);
 	return buf - buf_start;
 }
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.12 linux/mm/vmscan.c:1.1.1.1.2.98
--- linux/mm/vmscan.c:1.1.1.12	Mon Jan 11 22:24:24 1999
+++ linux/mm/vmscan.c	Sat Jan 16 00:06:41 1999
@@ -10,6 +10,12 @@
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  */
 
+/*
+ * free_user_and_cache() and always async swapout original idea.
+ * PG_dirty shrink_mmap swapout
+ * Copyright (C) 1999  Andrea Arcangeli
+ */
+
 #include <linux/slab.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
@@ -20,6 +26,8 @@
 
 #include <asm/pgtable.h>
 
+int swapout_interval = HZ;
+
 /*
  * The swap-out functions return 1 if they successfully
  * threw something out, and we got a free page. It returns
@@ -53,12 +61,7 @@
 		return 0;
 
 	if (pte_young(pte)) {
-		/*
-		 * Transfer the "accessed" bit from the page
-		 * tables to the global page map.
-		 */
 		set_pte(page_table, pte_mkold(pte));
-		set_bit(PG_referenced, &page_map->flags);
 		return 0;
 	}
 
@@ -66,9 +69,6 @@
 	 * Is the page already in the swap cache? If so, then
 	 * we can just drop our reference to it without doing
 	 * any IO - it's already up-to-date on disk.
-	 *
-	 * Return 0, as we didn't actually free any real
-	 * memory, and we should just continue our scan.
 	 */
 	if (PageSwapCache(page_map)) {
 		entry = page_map->offset;
@@ -77,8 +77,9 @@
 drop_pte:
 		vma->vm_mm->rss--;
 		flush_tlb_page(vma, address);
+		entry = atomic_read(&page_map->count);
 		__free_page(page_map);
-		return 0;
+		return entry <= 2;
 	}
 
 	/*
@@ -86,11 +87,6 @@
 	 * by just paging it in again, and we can just drop
 	 * it..
 	 *
-	 * However, this won't actually free any real
-	 * memory, as the page will just be in the page cache
-	 * somewhere, and as such we should just continue
-	 * our scan.
-	 *
 	 * Basically, this just makes it possible for us to do
 	 * some real work in the future in "shrink_mmap()".
 	 */
@@ -127,7 +123,10 @@
 	 * That would get rid of a lot of problems.
 	 */
 	if (vma->vm_ops && vma->vm_ops->swapout) {
-		pid_t pid = tsk->pid;
+		pid_t pid;
+		if (!(gfp_mask & __GFP_IO))
+			return 0;
+		pid = tsk->pid;
 		vma->vm_mm->rss--;
 		if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
 			kill_proc(pid, SIGBUS, 1);
@@ -151,14 +150,9 @@
 	set_pte(page_table, __pte(entry));
 	flush_tlb_page(vma, address);
 	swap_duplicate(entry);	/* One for the process, one for the swap cache */
-	add_to_swap_cache(page_map, entry);
-	/* We checked we were unlocked way up above, and we
-	   have been careful not to stall until here */
-	set_bit(PG_locked, &page_map->flags);
-
-	/* OK, do a physical asynchronous write to swap.  */
-	rw_swap_page(WRITE, entry, (char *) page, 0);
-
+ 	add_to_swap_cache(page_map, entry);
+	if (PageTestandSetDirty(page_map))
+		printk(KERN_ERR "VM: page was just marked dirty!\n");
 	__free_page(page_map);
 	return 1;
 }
@@ -199,7 +193,7 @@
 
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
+		tsk->mm->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
@@ -271,7 +265,7 @@
 	/*
 	 * Go through process' page directory.
 	 */
-	address = p->swap_address;
+	address = p->mm->swap_address;
 
 	/*
 	 * Find the proper vm-area
@@ -293,8 +287,8 @@
 	}
 
 	/* We didn't find anything for the process */
-	p->swap_cnt = 0;
-	p->swap_address = 0;
+	p->mm->swap_cnt = 0;
+	p->mm->swap_address = 0;
 	return 0;
 }
 
@@ -303,10 +297,11 @@
  * N.B. This function returns only 0 or 1.  Return values != 1 from
  * the lower level routines result in continued processing.
  */
-static int swap_out(unsigned int priority, int gfp_mask)
+static int grow_freeable(unsigned int priority, int gfp_mask)
 {
 	struct task_struct * p, * pbest;
-	int counter, assign, max_cnt;
+	int counter, assign;
+	unsigned long max_cnt;
 
 	/* 
 	 * We make one or two passes through the task list, indexed by 
@@ -325,8 +320,6 @@
 	counter = nr_tasks / (priority+1);
 	if (counter < 1)
 		counter = 1;
-	if (counter > nr_tasks)
-		counter = nr_tasks;
 
 	for (; counter >= 0; counter--) {
 		assign = 0;
@@ -338,13 +331,13 @@
 		for (; p != &init_task; p = p->next_task) {
 			if (!p->swappable)
 				continue;
-	 		if (p->mm->rss <= 0)
+	 		if (p->mm->rss == 0)
 				continue;
 			/* Refresh swap_cnt? */
 			if (assign)
-				p->swap_cnt = p->mm->rss;
-			if (p->swap_cnt > max_cnt) {
-				max_cnt = p->swap_cnt;
+				p->mm->swap_cnt = p->mm->rss;
+			if (p->mm->swap_cnt > max_cnt) {
+				max_cnt = p->mm->swap_cnt;
 				pbest = p;
 			}
 		}
@@ -376,7 +369,7 @@
        char *revision="$Revision: 1.5 $", *s, *e;
 
        swap_setup();
-       
+
        if ((s = strchr(revision, ':')) &&
            (e = strchr(s, '$')))
                s++, i = e - s;
@@ -406,12 +399,6 @@
 	strcpy(current->comm, "kswapd");
 
 	/*
-	 * Hey, if somebody wants to kill us, be our guest. 
-	 * Don't come running to mama if things don't work.
-	 */
-	siginitsetinv(&current->blocked, sigmask(SIGKILL));
-	
-	/*
 	 * Tell the memory management that we're a "memory allocator",
 	 * and that if we need more memory we should get access to it
 	 * regardless (see "__get_free_pages()"). "kswapd" should
@@ -426,11 +413,10 @@
 	current->flags |= PF_MEMALLOC;
 
 	while (1) {
-		if (signal_pending(current))
-			break;
-		current->state = TASK_INTERRUPTIBLE;
 		run_task_queue(&tq_disk);
-		schedule_timeout(HZ);
+		current->state = TASK_INTERRUPTIBLE;
+		flush_signals(current);
+		schedule_timeout(swapout_interval);
 
 		/*
 		 * kswapd isn't even meant to keep up with anything,
@@ -438,13 +424,37 @@
 		 * point is to make sure that the system doesn't stay
 		 * forever in a really bad memory squeeze.
 		 */
-		if (nr_free_pages < freepages.high)
+		if (nr_free_pages < freepages.min)
 			try_to_free_pages(GFP_KSWAPD);
 	}
 
 	return 0;
 }
 
+static int free_user_and_cache(int priority, int gfp_mask)
+{
+	int freed, grown = 0;
+	static int need_freeable = 0;
+
+	freed = shrink_mmap(priority, gfp_mask);
+
+	if (need_freeable)
+	{
+		grown = grow_freeable(priority, gfp_mask);
+		if (freed)
+			need_freeable = 0;
+	} else {
+		freed = shrink_mmap(priority, gfp_mask);
+		if (!freed)
+		{
+			grown = grow_freeable(priority, gfp_mask);
+			need_freeable = 1;
+		}
+	}
+
+	return freed || grown;
+}
+
 /*
  * We need to make the locks finer granularity, but right
  * now we need this so that we can do page allocations
@@ -457,34 +467,35 @@
 int try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
-	int count = SWAP_CLUSTER_MAX;
+	static int state = 0;
+	int count = pager_daemon.tries;
 
 	lock_kernel();
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	priority = 6;
-	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-
-		/* Try to get rid of some shared memory pages.. */
-		while (shm_swap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
-	
-		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+	priority = pager_daemon.priority;
+	switch (state)
+	{
+		do {
+		case 0:
+			while (free_user_and_cache(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 1;
+		case 1:
+			/* Try to get rid of some shared memory pages.. */
+			while (shm_swap(priority, gfp_mask)) {
+				if (!--count)
+					goto done;
+			}
+			state = 0;
 
-		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+			shrink_dcache_memory(priority, gfp_mask);
+		} while (--priority >= 0);
+	}
 done:
 	unlock_kernel();
 
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.6 linux/kernel/fork.c:1.1.1.1.2.10
--- linux/kernel/fork.c:1.1.1.6	Mon Jan 11 22:24:21 1999
+++ linux/kernel/fork.c	Mon Jan 11 22:56:09 1999
@@ -209,16 +209,19 @@
 		tmp->vm_next = NULL;
 		file = tmp->vm_file;
 		if (file) {
+			struct semaphore * s = &file->f_dentry->d_inode->i_sem;
 			file->f_count++;
 			if (tmp->vm_flags & VM_DENYWRITE)
 				file->f_dentry->d_inode->i_writecount--;
-      
+
+			down(s);
 			/* insert tmp into the share list, just after mpnt */
 			if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
 				mpnt->vm_next_share->vm_pprev_share =
 					&tmp->vm_next_share;
 			mpnt->vm_next_share = tmp;
 			tmp->vm_pprev_share = &mpnt->vm_next_share;
+			up(s);
 		}
 
 		/* Copy the pages, but defer checking for errors */
@@ -511,6 +514,7 @@
 
 	p->did_exec = 0;
 	p->swappable = 0;
+	p->trashing = 0;
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
Index: linux/kernel/sysctl.c
diff -u linux/kernel/sysctl.c:1.1.1.6 linux/kernel/sysctl.c:1.1.1.1.2.12
--- linux/kernel/sysctl.c:1.1.1.6	Mon Jan 11 22:24:22 1999
+++ linux/kernel/sysctl.c	Wed Jan 13 21:23:38 1999
@@ -32,7 +32,7 @@
 
 /* External variables not in a header file. */
 extern int panic_timeout;
-extern int console_loglevel, C_A_D;
+extern int console_loglevel, C_A_D, swapout_interval;
 extern int bdf_prm[], bdflush_min[], bdflush_max[];
 extern char binfmt_java_interpreter[], binfmt_java_appletviewer[];
 extern int sysctl_overcommit_memory;
@@ -216,6 +216,8 @@
 };
 
 static ctl_table vm_table[] = {
+	{VM_SWAPOUT, "swapout_interval",
+	 &swapout_interval, sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_FREEPG, "freepages", 
 	 &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec},
 	{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0600, NULL,
@@ -223,11 +225,7 @@
 	 &bdflush_min, &bdflush_max},
 	{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
 	 sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
-	{VM_BUFFERMEM, "buffermem",
-	 &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGECACHE, "pagecache",
-	 &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec},
-	{VM_PAGERDAEMON, "kswapd",
+	{VM_PAGERDAEMON, "pager",
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
 	{VM_PGT_CACHE, "pagetable_cache", 
 	 &pgt_cache_water, 2*sizeof(int), 0600, NULL, &proc_dointvec},
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.6 linux/include/linux/mm.h:1.1.1.1.2.23
--- linux/include/linux/mm.h:1.1.1.6	Mon Jan 11 22:23:57 1999
+++ linux/include/linux/mm.h	Fri Jan 15 23:23:53 1999
@@ -118,12 +118,10 @@
 	unsigned long offset;
 	struct page *next_hash;
 	atomic_t count;
-	unsigned int unused;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
 	struct wait_queue *wait;
 	struct page **pprev_hash;
 	struct buffer_head * buffers;
-	unsigned long map_nr;	/* page->map_nr == page - mem_map */
 } mem_map_t;
 
 /* Page flag bit values */
@@ -165,6 +163,7 @@
 
 #define PageClearSlab(page)	(clear_bit(PG_Slab, &(page)->flags))
 #define PageClearSwapCache(page)(clear_bit(PG_swap_cache, &(page)->flags))
+#define PageClearDirty(page)	(clear_bit(PG_dirty, &(page)->flags))
 
 #define PageTestandClearDirty(page) \
 			(test_and_clear_bit(PG_dirty, &(page)->flags))
@@ -302,8 +301,7 @@
 
 /* filemap.c */
 extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
 extern void truncate_inode_pages(struct inode *, unsigned long);
 extern unsigned long get_cached_page(struct inode *, unsigned long, int);
 extern void put_cached_page(unsigned long);
@@ -387,9 +385,9 @@
 }
 
 #define buffer_under_min()	((buffermem >> PAGE_SHIFT) * 100 < \
-				buffer_mem.min_percent * num_physpages)
-#define pgcache_under_min()	(page_cache_size * 100 < \
-				page_cache.min_percent * num_physpages)
+				pager_daemon.buffer_min_percent * num_physpages)
+#define pgcache_under_min()	((page_cache_size-swapper_inode.i_nrpages) * 100 < \
+				pager_daemon.cache_min_percent * num_physpages)
 
 #endif /* __KERNEL__ */
 
Index: linux/include/linux/pagemap.h
diff -u linux/include/linux/pagemap.h:1.1.1.1 linux/include/linux/pagemap.h:1.1.1.1.2.3
--- linux/include/linux/pagemap.h:1.1.1.1	Fri Nov 20 00:01:16 1998
+++ linux/include/linux/pagemap.h	Fri Jan 15 21:47:58 1999
@@ -14,7 +14,7 @@
 
 static inline unsigned long page_address(struct page * page)
 {
-	return PAGE_OFFSET + PAGE_SIZE * page->map_nr;
+	return PAGE_OFFSET + ((page - mem_map) << PAGE_SHIFT);
 }
 
 #define PAGE_HASH_BITS 11
Index: linux/include/linux/sched.h
diff -u linux/include/linux/sched.h:1.1.1.6 linux/include/linux/sched.h:1.1.1.1.2.13
--- linux/include/linux/sched.h:1.1.1.6	Mon Jan 11 22:24:03 1999
+++ linux/include/linux/sched.h	Thu Jan 14 12:42:58 1999
@@ -169,6 +174,7 @@
 	unsigned long rss, total_vm, locked_vm;
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
+	unsigned long swap_cnt, swap_address;
 	/*
 	 * This is an architecture-specific pointer: the portable
 	 * part of Linux does not know about any segments.
@@ -177,15 +183,17 @@
 };
 
 #define INIT_MM {					\
-		&init_mmap, NULL, swapper_pg_dir, 	\
+		&init_mmap, NULL, swapper_pg_dir,	\
 		ATOMIC_INIT(1), 1,			\
 		MUTEX,					\
 		0,					\
 		0, 0, 0, 0,				\
-		0, 0, 0, 				\
+		0, 0, 0,				\
 		0, 0, 0, 0,				\
 		0, 0, 0,				\
-		0, 0, NULL }
+		0, 0,					\
+		0, 0,					\
+		NULL }
 
 struct signal_struct {
 	atomic_t		count;
@@ -270,8 +278,7 @@
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
-	unsigned long swap_address;
-	unsigned long swap_cnt;		/* number of pages to swap on next pass */
+	int trashing:1;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
@@ -355,7 +362,7 @@
 /* utime */	{0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */	0,0,0,0,0,0, \
-/* swp */	0,0,0, \
+/* swp */	0,0, \
 /* process credentials */					\
 /* uid etc */	0,0,0,0,0,0,0,0,				\
 /* suppl grps*/ 0, {0,},					\
Index: linux/include/linux/swap.h
diff -u linux/include/linux/swap.h:1.1.1.6 linux/include/linux/swap.h:1.1.1.1.2.19
--- linux/include/linux/swap.h:1.1.1.6	Mon Jan 11 22:24:05 1999
+++ linux/include/linux/swap.h	Fri Jan 15 21:47:58 1999
@@ -33,7 +33,7 @@
 #define SWP_USED	1
 #define SWP_WRITEOK	3
 
-#define SWAP_CLUSTER_MAX 32
+#define SWAP_CLUSTER	(pager_daemon.swap_cluster)
 
 #define SWAP_MAP_MAX	0x7fff
 #define SWAP_MAP_BAD	0x8000
@@ -76,7 +76,7 @@
 extern int try_to_free_pages(unsigned int gfp_mask);
 
 /* linux/mm/page_io.c */
-extern void rw_swap_page(int, unsigned long, char *, int);
+extern void rw_swap_page(int, unsigned long, struct page *, int);
 extern void rw_swap_page_nocache(int, unsigned long, char *);
 extern void rw_swap_page_nolock(int, unsigned long, char *, int);
 extern void swap_after_unlock_page (unsigned long entry);
@@ -134,13 +134,6 @@
 extern unsigned long swap_cache_find_total;
 extern unsigned long swap_cache_find_success;
 #endif
-
-extern inline unsigned long in_swap_cache(struct page *page)
-{
-	if (PageSwapCache(page))
-		return page->offset;
-	return 0;
-}
 
 /*
  * Work out if there are any other processes sharing this page, ignoring
Index: linux/include/linux/swapctl.h
diff -u linux/include/linux/swapctl.h:1.1.1.4 linux/include/linux/swapctl.h:1.1.1.1.2.9
--- linux/include/linux/swapctl.h:1.1.1.4	Mon Jan 11 22:24:05 1999
+++ linux/include/linux/swapctl.h	Fri Jan 15 23:23:53 1999
@@ -4,32 +4,23 @@
 #include <asm/page.h>
 #include <linux/fs.h>
 
-typedef struct buffer_mem_v1
+typedef struct freepages_s
 {
-	unsigned int	min_percent;
-	unsigned int	borrow_percent;
-	unsigned int	max_percent;
-} buffer_mem_v1;
-typedef buffer_mem_v1 buffer_mem_t;
-extern buffer_mem_t buffer_mem;
-extern buffer_mem_t page_cache;
-
-typedef struct freepages_v1
-{
 	unsigned int	min;
 	unsigned int	low;
 	unsigned int	high;
-} freepages_v1;
-typedef freepages_v1 freepages_t;
+} freepages_t;
 extern freepages_t freepages;
 
-typedef struct pager_daemon_v1
+typedef struct pager_daemon_s
 {
-	unsigned int	tries_base;
-	unsigned int	tries_min;
+	unsigned int	priority;
+	unsigned int	buffer_min_percent;
+	unsigned int	cache_min_percent;
+	unsigned int	tries;
 	unsigned int	swap_cluster;
-} pager_daemon_v1;
-typedef pager_daemon_v1 pager_daemon_t;
+	unsigned int	max_async_pages;
+} pager_daemon_t;
 extern pager_daemon_t pager_daemon;
 
 #endif /* _LINUX_SWAPCTL_H */



--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-15 23:56                                                                 ` [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty Andrea Arcangeli
@ 1999-01-16 16:49                                                                   ` Andrea Arcangeli
  1999-01-17 23:47                                                                     ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-16 16:49 UTC (permalink / raw
  To: Steve Bergman, dlux, Nicholas J. Leon, Eric W. Biederman
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie, Heinz Mauelshagen, Max

In my new PG_dirty implementation I made a grave bug that was sometimes
causing, under heavy swapping, a corrupted swap entry (note, the kernel
memory was always safe, so no risk of bad fs corruption). It sometimes
happened that a swapin was swapping in random swap data, instead of the
(not) swapped out page of the process. This is because when GFP_IO was not
set (and so I was not going to sync the page to disk) I didn't re-set the
Dirty bit to tell the MM that the page is still dirty.

I noticed the bug one hour ago and I fixed it now.

> +		if (PageSwapCache(page)) {
> +			unsigned long entry = page->offset;
> +			if (PageTestandClearDirty(page) &&
			    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +			    swap_count(entry) > 1)
> +			{
> +				if (!(gfp_mask & __GFP_IO))
> +					continue;
					^^^^^^^^^
> +				entry = page->offset;
> +				set_bit(PG_locked, &page->flags);
> +				atomic_inc(&page->count);
> +				rw_swap_page(WRITE, entry, page, 0);
> +				atomic_dec(&page->count);
> +			}
> +			delete_from_swap_cache(page);
> +			return 1;
> +		}
> +

With the bug fixed it seems really rock solid. It would be interesting
making performance comparison with kernels that are swapping out in
swap_out() (e.g. clean pre7). I am not sure it's a win (but I'm sure it's
more fun ;). The swapout performances are a bit decreased even if
sometimes (in the second pass) my benchmark give me super results with
this my new code (43 sec, that's been the record of every other kernel
tried). 

The only thing I don't like is that the kernel seems to stall a bit every
few seconds in the grow_freeable() path (the old swap_out()); also the
profiling reports try_to_swap_out() as the function where the kernel is
spending most of the time (20 timer interrupts against 10 of shrink_mmap).
Maybe because now I'm calling grow_freeable() many more times than I
used to... Maybe I should reinsert something like the smart swapout
weight code to allow grow_freeable() to scale greatly... It's just quite
good though.

The fixed patch (arca-vm-22) can be downloaded from here:

ftp://e-mind.com/pub/linux/kernel-patches/2.2.0-pre7-arca-VM-22.gz

(if it's too slow drop me a line and I'll post privately it via email, I
am not posting it here again to not spam too much ;)

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: 2.2.0-pre[56] swap performance poor with > 1 thrashing task
  1999-01-09  6:44                                               ` Linus Torvalds
  1999-01-09 18:58                                                 ` Andrea Arcangeli
  1999-01-11  9:21                                                 ` Buffer handling (setting PG_referenced on access) Zlatko Calusic
@ 1999-01-16 17:35                                                 ` Andrea Arcangeli
  2 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-16 17:35 UTC (permalink / raw
  To: Linus Torvalds; +Cc: Zlatko Calusic, linux-kernel, linux-mm

On Fri, 8 Jan 1999, Linus Torvalds wrote:

> As a no-op, it can now randomly and unprectably result in even worthwhile
> buffers just being thrown out - possibly quite soon after they've been
> loaded in. I happen to believe that it doesn't actually matter (and I'm

I think it doesn't matter because the buffer_under_min() check just
protects the buffer cache enough. In arca-vm-22 I removed the specific
buffer and cache min limits, so I applied Zlatko's patch ;).
Basically arca-vm-22 take the sum of the buffermem+page_cache_size always
close to a percentage tunable via sysctl (10% as default) when _low_ on
memory. So the buffer aging now make sense to me (not benchmarked though
;).

Somebody on the list asked for an algorithm that doesn't work by magic
but is tunable. Having a constant cache+buffermem memory size under
swapping seems to work very well, and even if it doesn't work by magic I
like it right now.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-16 16:49                                                                   ` Andrea Arcangeli
@ 1999-01-17 23:47                                                                     ` Andrea Arcangeli
  1999-01-18  5:11                                                                       ` Linus Torvalds
  1999-01-18 19:22                                                                       ` Andrea Arcangeli
  0 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-17 23:47 UTC (permalink / raw
  To: Steve Bergman, dlux, Nicholas J. Leon, Eric W. Biederman,
	Kalle Andersson
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie, Heinz Mauelshagen, Max

On Sat, 16 Jan 1999, Andrea Arcangeli wrote:

> With the bug fixed it seems really rock solid. It would be interesting
> making performance comparison with kernels that are swapping out in
> swap_out() (e.g. clean pre7). I am not sure it's a win (but I'm sure it's

Even if rock solid my PG_dirty implementation is been a lose. This because
swapping out from shrink_mmap() was causing not ordered write to disk. So
even if the process userspace was ordered in the swap, it was async
written not-ordered. This was harming a _lot_ swapout performances... 
There's also the point that shrink_mmap() is so heavily used by default that
stalling in it many times (if unlucky, due to many consecutive dirty swap
cache pages to sync to disk) was not the best. So I dropped the
shrink_mmap()-swapout idea completely.

But I had new ideas that here has improved things a _lot_.

Steve could you try the image test over this new patch against pre7 clean?

Also you Kalle, could you try my new arca-vm-24 on your 16Mbyte machine? I
think that you'll be very happy ;).

Note, arca-vm-24 is tunable:

/proc/sys/vm/pager has 5 entry:

6       10       32      32      2048

The first is the lowest priority that try_to_free_pages() uses. The
higher this value is, the lower the starting priority will be, and this will
result in more swapping. 6 is a reasonable value. If the system swaps too
much even when not low on memory, you could decrease this value to 4, for
example.

The second value is the max percentage of cache in the system that we take
even if we are very low on memory (the system should autobalance close to
that percentage after some time). The `cache' is intended as
cache+buffers.

The third is the number of pages we free every time a process tries to
reclaim memory when the system is low on memory. 32 pages at a time is a
safe value.  Increasing this value will cause the process that gets stuck
in the memory-freeing path to stall for more time (so it could harm
interactive performance).

The fourth is the cluster size on the swap space. pre7 set it to 512
pages, but I checked that it's a _lose_. 32 is/was a good value.

The fifth number is the number of pages that we allow to run async at the
same time (when the limit is reached we'll wait for I/O completion).
pre7 set this value to 32, but that value is a _bottleneck_ for some
reasonably fast systems. Increasing this value too much could cause the
system to have too many pages in flight to disk at one time, and so it could
risk going out of memory (if freepages.min is too low; note I never
tried arca-vm-24 on very low memory and I don't know if it's reproducible).
That never happens here and 2048 is something like infinite. But I am
running with freepages.min = 255 because I have 128Mbyte of RAM. So if
arca-vm-24 will kill random processes on your system (search for Out of
memory messages in `dmesg`) the first thing you should try is:

echo 6 10 32 32 32 >/proc/sys/vm/pager

this way you'll return to clean pre7 from this point of view.

begin 664 2.2.0-pre7-arca-VM-24.gz
M'XL("+MOHC8"`S(N,BXP+7!R93<M87)C82U632TR-`"D/&MWV[:2G^E?@?J>
MFRM9E"U2#\MRXZUK.XVW=NSC1],]:0X/14$2:XE4^?#C9KV_?6<&(`F0E.WT
M)JTD`IC!8#"8%X8Y#2;\<<06?I`^[DSCG7$ZG?)HV]N8^-,I:Z?5GI&UC7]W
MU_98V_:V96VTV^WUP,:UF[#_=@/&]IAEC_J#4:?+K+V]O8U6JU4%RX=;`V;M
MCJSAJ-\3PW_ZB;5WN[OF@+7P:Y?]]-,&,^`/6Z6)LW!C^(C2QGC>W*?V\;Q]
M,':FBS2>)_Z2L_>L0QW/&RW#2,+4FSMBTAPDXDD:!6P\QZ?G#2:F[/=HRGX_
MFU(@AKD<Q&N\9W\"`WT>[^=]<>(F_#U-YP<QCQ+'#Y+0^2OE*8_%=*U:$E0*
MX'%G:^-4W34_\!;IA.]D7-N>ES:O.J"TAVL'T%9VE:U<C\JX3@.QHUUF=T8]
M>V19Y1VM0M=LK%ULK+4[1"Y;NWOF'G'Y'Q,^]0/.?O[H7/&_#*,+S&`=YD]9
M,N=,<(W-W9B-.0^8']R["W\"7)^PK1T-^C(*$^Y!AS%`%%8)A1^S53:"0#=:
M&;`8X:S<&<?],8S&DB^=I;MB+79^>.E\NFI@.VXXS.PVFP5H>6^-F"?.V$\:
ME[\X$8=&'GA\8K)WI3G:!].%.XL!TP:#S6=LB]U$3X".W7&^(KJ789PP+UPN
MPV#QQ-(8R`;A6TQB8`*+_6"VX,QS/1@)O.<Q:U@#PC-^2GC<1%3^$E9\S]F*
M1],P6KI`RC9C-W-@11A->`0X6#P/T\4$F$N[8_<MT]IC+1L.P4`<`BFI@%,L
M3.4TKJLX!RC8>)C:L*`V$'+,5Q'W:*O:[(&S21C\*Q'K`XK:<E]R+C'B"$&Z
MP1,L/^+;^$0MGSES/*3488G@4[I"*2!.(5,+/";L9P+M;D*0DY#'.&_,^1+A
M(NXN@)]CSA["*)DS/V'<!RS1-CN=$IG^Q"1(Z'D@YBS"\([%X9(G<V39PK_#
M>?UXE)%G_'U)4J'_8V%JHV"WUZ*;A.P;>V8/<Q]$I]&!\7"`Y`%F/XH3O/)7
MW)G&CK\]/ZCV+OW`?US?S1\3.^]]0:$MEZ\H-!P@M=#@M0&HT.SA6H6FHC+.
M0Z'00(_9]LCNCOJ[+RLTA"Z@AJS3&5F@"?<4A68-P5ZT\$N>F#2(_5G`46Y`
M6L+I%+81=7Z<1*F7"&G="H!7#J@U,D%N$BY]S\'CG@8PMEW@`'O"T@!/_WX%
M->WZ/BH[@4`TF"2J;!7&L3]&S;$2ZM*-GP)O'H5!F,;0C"HP(^G!]1-AM-@6
M_JY0N[6"HWPOZ:55=SNF;<.R[3W3[FB*'(1U$7IWH(<-UM&:>12%$;1:6FLA
MV=!E%^(+71,_2IZ@M:NUIJLDQ"5!1T_KF$:<.^XTX3A)7\?$O2CO&FA=\8.[
M<M(`:98CV*XVX/C\$)V/H=9XO7#'V+I7Q45*V3"LCMYUYZ\``+RH5OU:NEJ'
MMI:>UJ6MI:]UU:QEH`V0:]G5&K.U#*NXY%I@F:V:M71*&PD>T#UN8]="KP;D
M\MP%50EZ%*0G27S0]J"`T9*0VG;'8)RVM[<U0WX)\G9&XM-`T6L:C=SRY((%
M&E!T9HJOJ<.?H)Q)<!V>)/`U\*M<(.M(T!1Q&4];Q7.,TEM+!LGU:V3<2M&H
M(R(3F]=P?``I.D0YJ$-2B-AK:(Y!XM:B*<2Q!@WIBO[0M-#G`Z?"LG5=`:.O
M>8("F*%6;!XVOT8:0H.@'J&<UJ`HA+@.D;Y?-[`H-Y@`1G7G_@!MG&T?]#H*
M\C=M8X&U3.<?%+W48GZ%;&V"HP5W(Y6#'C9\#P\%!ITZ'<O?8",A51C)RHS4
M)O@>5M:16V'F]Y"/0MKMV.80'-^]+/HC_34%3PF\MVV/=!28;`ZN\'WH3\"-
M7(+V@G`OG'#AB&D&$WSAMARNVVSL=LB@KP5`DQ_/P3N_<Y8P>0.>36S$8%(9
M\N'P^N;H\.RL43L6G7&57I@K0%]<H3C.**`FMF7JI"H8]"7,0$Z)F1-MX?58
M,LHU8C".5S&4YZ4-&0Y,J\-:70@8][10Y'[I9I%&R>].P>6,''!2&W`,&J(1
M''!V<,`N#W\Y<:X_GGZX:8)K;W4Z[$<ID88$AH';`.I`;.)QX.\6"]*ELYH_
MQ<0L1;FO9D2\/AUMK&B/_7_S\B1%]\N39&;6X."GS>8"I($&9/V"P#[3GS\P
MX8$/K1(QV9(/*B-Q8`2Q"$ASL"T`QNX"(\4J::AW>`!N.YX,Q_GUY.K3R9GC
MB&#Z!6\_QHU^Q>&78U[T^?4QK^0Q2@C+GG^O)CE5B\"XF:<"L(=9K9X]Z@\5
MYW^P)[(9O2QGI!^5*`9W/`D3=^'<+TTFW!?X6?7E8=<=X<]7NKQ5"B`01<9W
MF$[2.X5JPU-/O]S)!-PP0H)I!0/S"ACOPW^P!C?RYCZ&[FG$V_&*>_X4HH95
M"$>41R,12T-0[(X77,"NW"B!"(:=(4LHE&9!"#%\$#Z@^P9Q-H;J,9\M09CC
M;9GAV36M/C!EV#4MP97G?=5HG7XZO7'.S]DW%$%A7]]!7)F0!C/9I]NS,[&:
M%<:X,[0-)C-(9%\=)^S`X<W%^>F1@Q,UK*;)+),F@I[SVYN3WTTY,3QWM-\L
M^R\G+&\3+:VB93T8*P_*\2#)[#G'8NA(BT<Y#G@F=2ON.,B0?/JFA(R&(6-&
M2M_L=LB([0ZE.,)!72[9U$T7N%,3XA8HY&DXH@0&\T@H9JD[%BF1&'-L;BPS
M(@!;2$D(SOL\XNZD:*(04A='U&W3!4CCTOU3_`AP3I-Y>8^7=WG4)Y*GB=A)
M%+V1I47`A9CGPMU><PCV#0R&06V-@7@06U)<&'/0ND$#8-0-K7&,M+=HVB2"
MH-8/9C@K\FL5A1Y,PKR(3T"H?7<1RX7Z$XC0X=/D^!'CQQ0_<0$SZIQA)W[$
M,^J<82<9,THJM[H#N]B8E)+56SO&MXY)?Y_A`S0S$0'T'UW>,AR"TS,8PY[-
M[$L,`AXBN(2FOVAMH"=^6!4]J.SUMFR2VI5FYP$I!%/-$Z\\"?XM!L7I:K5@
MLV@5`YD=(O$Y$^67C`+LR&LV@8:\;!+4(601]M9;!`U=C4'HOV(0"-[X$/D"
MKL]L:]3;U>Q!MTO;W)6[G&F\Z\^7SNWUR;%AZ6V?KTYO3BY^-;JJ+W/]^?#2
M.3J[O;XYN7+.#W]G7;L(OM5.X7/DUEL<@D4:@R[7@@2".:?_?S<ZC[O3Z;2F
M\^?#8^@<=CH=<1LRP)70)ZY$\3N3Z,E)0A$]"C=22U;-IBLR5<U]Z4-GB;0=
M<DK\4#C3;<V9?A`N.CF#Y,*6?$AO[D;H6NI>\-M@-3>[UAU543A!*%RN]62\
M`HY&_@V+T%`(]88Q=):](7)UQYC!(8V>FED"KF=:7;"RW9[,.];ZZD7DX\!V
M@XY"7V3_C8/CU/.$,R'\OHVV$J/@I4,)'C1\@4(/;RCFVFA_`]4-_F.#(G<]
M@FNBF90>/C:T#[+,:3MK[L#O9R%5=-/Q.8SN&#HAXJ8GXF#+.'DD(9DOJ>!`
M@\;`>DS=D]%#[+`+LR#$MM>4E)<LWJ"G:)34+;TWC2)MU7E16VE(_Y["(A3:
MC5R?;N1V"YW5,[LV:_4PG4LZ*\_IN[$XL[6I?KSI.T"UE3RMP'Q,LV-6!%/.
MO;71*O6BTB"=X<0;[%LISVTH@5$Y!VZ,PR@*']9V+]U'I>]9)V._H%)K5Y^2
M(OY66Y6'-0.*4&L?S\?:!0,WVM]4?TFN>+_<M@@?*FUS?S:G=:GXE&6IS<H#
M+*KUK#]G)U]I+'[OUVRH:F/JME3KK]O5)/)AEK$;\_WZ+F)"2^]913Z<SN2I
MTJ'%J)5>0ECAGFH9B8NE-2F,+/7HSTEE/A0[NE=Q)/]:SQ60C.-ZN_:X7XJM
MSTX_W?[NH&D^NCES/J*]U+34'>#CBYTIZ+]*68?65W*>ZOHJ>J@6057[V)52
M``VP!(#5(,JE6=^R,&[N6[T\VV:LV@<3\*+Y(_>R$@YHRJ,"T=;"MLQE5X?A
M!30\WQQ>_^K<?CK]!/[1U>WES>G/9R>BQ,(+5T\BPFYX8*VX(R_+5F!0ZY@;
M/\6H/->P-^NM9W"IEUAL5UE<1E+#9'L-DS-0XS-((H%TT1VUNZ.NZH[:Y([:
M:DKSA*317;![,(G(6!'2^Q#WL3F$=V`W,>NYK68\T:];N0'&FQ"0@,75DY5>
M&,3A@CN+<+;@]WQALB/GT#G6TY5K!HD('E`ZE(2X5[T3!!M/ILXJ6G[Y:N)/
MK/%!?:$]NH]?OA9`Y&6-(<9=)LZ?[KTK\.(%%(\(3.D"T5KPY-[G#]BG3RPX
M[(3W8%7"Y1+3#G!4HR<9:5L#%&#\&BHI#A1#"(P1D+(H['XI?GSY"L+Y#<3W
MVV_G=+`O;F],MEE>^Z8)0]B[<C,PR?\W#Z?H5C9-UAGT>EG^XQTZ.<X$4SCW
MW'LV0=)QB@]7)R>7O\`,N6[?-/$4L'=Y0XY4,05O0_[S\0>(0#X"=KD'FUB3
M(#;*9'M;)6([,O-A"L[971/DNV7;>9T5`6=;:RI/[F,^Y\5O)U='%^?GF#HZ
M.;^X^A^8O;(W2,>Z?4-$V8K7C7EM^6VQ_-L/'TZN@`ID0):2W<3>O#(#&G+V
MJM["VR;`U.[1X=''$Y@`-X9LGIR@<#?^PPFNC@^!CY]@BCL4MPF)7K63S-0F
M,>^=:K+RV76S]C8!NOSEQE$72$?$D<L48KJ:R2L"Y\&EJT5[K5A59]'T.42>
M^35.29FK75()[ZWO0C7>5PM-ZL#S?#';Q7PQZ.1NQ4U7X?(:.VN7@?:VA[G.
MIRC/MDRKAV46?3PT>%IJ;H=8YBV9>@2^09E"J9-*.64,,H7]+"?3%CX<".C2
MLO^5BA#Z0F"AVRD%6<Y-4S-@:FBHV(\_,JO)=E@C([ME4;*@;>0`@@8Q\."`
M%=Y@&\T]$O">%955M!BD!0N<@`I9X$388+-^@&4VB15$;'&!+QP#3!E2COP!
M')>8@6U"`R"2X5C2@'<AX51<-M_Y*S:&\YC&HE[.ZNU2\4\O+_XQC(RU(H@%
M"IT@DO6H\(%1KB1,EJ)FU`!(_>VE5G$@L(J[2TE_%E"K91/RYGW0,;M[0!]\
MVUD=+0Z7V6-,YS8D2B*JB<RRFF(=89"`N'*Q-7B=`)^P\:<Q5G2X0@XHL8J_
M_@NKY^+0Q`JZ!Q?31*$</XG"%4+`O/B)[@:)!S@:6-9D2HYCKX1XP%`^"3$A
MO70C6).R9]O;8M`.?JU+)5`0(KH5_KY[)S,<N-*&FF/(EDTW=<6Z\7'"P4,`
M9S$*EVIR@^81(V1R@I+6QK-!(EJ:NLI0(789*V4II.`C)9L)@2!1=,;JHJK7
MG17:F1SZ@YJNDZ@R$6%5D&(Q0F#Q-N(%+K=>Y5!+0]JBJFRY>E\1I+:H9*WG
M`-TLJ^NON8"M94#UDEY25;-4B-V-O#92**(#T!M(\W-Q]]R1-\]E"[-<8UZ6
MJG&PU[17KC(K@)1U_A3>,SC%EC6R]D96!ZW$4+<J2VE2]'3/WJC3*4S*@!*[
M@RRQ:Z!@L-9[5KHRWE>Z@DC)]I8Z\LPG7<M01_L]:ZRY[FZQ-9?M+68WMS0S
ML6-U.OLH?QK.^KOJ>M!\TPC#`<O(K]L_HM==@.JNVT6UM^HGU/3BCG;[^H[6
M(2G=+G?MFH+4$JCJ,'3`8=@=V0/58>B10;*SK+!0`J*L393GR7(<T(;2"DRX
M1T8'K8]N#3*KN2YCRZBP`>)"UMC$4C-?)I%9<9HWA994S!9[]Y[]7\-"ZZX9
M-W$L"T%SPKN&:D--.HWYVQOBU0UAA"UAA#NY$59LG+XZ0%5:7,WJ8%"F'5]9
M72R79Q#>-RROM+YL952.K^DEJ95$K&F+HGRK8UI#N4(PD],THC1W,L=*D61;
M-.>Z\X>&ET8P=Y(3QBX_8/!T>'9V<90;2>D>JE>A^1LTY"\AQ9FK1(@U=0#G
M*H\?\41G:#/C(S`V1;T)M>54%9W$:F,6@LT/[]!:D:R#WR^H>&761?B0SVKH
M:Z!9C<J,Q0)?FI7\-3*";1VQ,&2U:(4U,4J<_]_W*NL+02"?KW*=IMZ@57&A
M>&G(*-$SH"/0[0]-U<T3W@T[>$]R.^'W>)8>!;_'X/S=[4M3?$RO9HS)<PT#
M=KISP:8AOIOA3MHN9H44ATOS'6%7E/QG$Z=ZX5*R]3<PE%*L:XE';.BWC^$+
M2!^G\9.\_U>.1,X&H@M.WQ?!HJ_->HN`MY5KS`%VE2Y\:KK0$.S6V`$5VKA)
M.3OF'K/W6`?BQ;V176?<<[CR%?10L^\D`JU"$)[E"V:232CC#^!<A^!:!\`A
M3#K+LD1ZL6BZ\%&7;(L-?WVW7M[N[X6OVVQ\(4&>+`S^X"?EQQPL8_F">?NO
M,B77[5+!('X/LR*+0Q;[RQ6X=`\1U0E!B")>"G-C#@HT\!(?Y#U[1RC`(PGJ
M@`-J\$OHDA%?)$*N%:%.S&:()$*M2\@*H_`#@@#C:F^FH:%T)RQN=+.;87"-
M1`"/*P;N?2>2RGVKBHST8V6$%D7+]Y.`BJ:(LTL..'H,ZN,/<B.P^DIXZ+A;
MF<&\B7S!2V*."T%?T):C<[]`5-[:0])=]NY>)K+J+.5)]O-^J=S>B_4KK]'X
M@:>[,?2*C'QIC6&<HB50FD+4-48CDR5;A<-J,HM"@76C!+\ICV$4CD>)C,K2
MR`BMKU368_VR@J)ZE!KM)-I+%Q/E=GIIJJ<K)AVPYC*B5^.:"J#R*U*]44^]
M8>Z8]I"U>KG[\IS5AGP$723T$)VK28CI`_F"GZC9$M("DKR#%_U)../H\U",
MV!;7$AAH"(T#'.WM(]IC+DK>[MU%BB]2`H]\NK$3"0`0FU3N>ZL&1U_#@7I+
MF,"N+<GD]QQ)%.5;<B6?D6X7S"<JA'(%FC3A5&.'VK;T*I80\,D&5N;"*DVA
MI$#MY"]-JBRYN;A0>49%F`E8;4]0D[]+IJM:6)=:%=D1&;<U5]UT5]$V;!,/
M#K[SMTR7+`O59+*"#@UXI#A$7,[7CAAT"(?[N`[',UV;K[E2?Y$0$@PQ3!#3
MKR.F/&JWKQ*44_#2]:RDHF\).LAVR"U&+\ES%UZZ`"^:"DS4W:?K:#%MN:I+
M6]';(292"-!#`W&60AL3*.8P1>DYUC8.:#QX]Q$1EJ50Q21EEU.<A6P[!;>R
M.G#)2#(:.*@KN%"FF605G4=Y.M;-(<'?OA"[TQO*+5,F)='.3DX;#X$\#B)S
M.Y5O.&::1K[ACW=V@[[9DSYRZ>#;J([Y`G:7`@X]:<T:#5``6$</>E!\-4MU
M^,T:G%V,#Q%G%@5KR@J3$75Z7;Q0O4Z[9[WU.K[42\FE&DU?1O+&5(0*^O+=
M1=?NB-OG3AZ3B%@K30JQ$$Z=\N93_H9?P!\<W0"6;/35R>%Q;G\;LC2/96!4
M+&P*[Z=BN#70#"(?S/[A3[$6Y/CDY]M?Z+(6#3<X#,E=8_.8C]/9;^<CL@N*
MN1:Z%FMIZ67;3;S$$G\V:1[VS\YP\8B:^I^K/X)-LW;;Z>)]S::+/KE;W1?Z
M<,,'U?W6X;_#N@M`M=``+#S^I_QS'3;5O<+G0!8:9'7KF:/NE!N^@$8CWGXX
M/3NY_KI?KH#%YDS[,;L_4&[79>TA781YKJPY1$]NW:1;L4\>,$FE")1SF03%
M(EYU^)P['MD!51S^\1.3R0&IY)<"*.9_I:)T&O2-="^V&;M`O?G`_W7/,S"P
M[FT$J"Q-EJC3_9,+TC@/'U"!TCU*Q.,PHIN45@;[G7!$Y-2/XJ1-2=!L$71_
MFYD&%X]`MFKY#T3(US\BO)M)8I;&TF\&X$3^PQ$%=S`ZPKMTTKS(CDB^/B+'
MP#1RRL*OB?@DQ8LU`$/PB1_?X=L'=Z+D7;B-=+?6&Q3_`@SI#_C(5(?(T3QC
M:..#ERU6X`2H6\N,1AU0.Z@8P`RTYHT\KL#A$"S+5UWWY9O\[$?1@R5YLNO_
M>[O:YB9R)/S9]RM$BB)VQDX<V\F%A%`5%A92L,L54+=5?'%-XL%XX[?SC!-R
M\..OGW[1:%Y,<K!W*2K$,Y)&([5:K>[G:3N[%T6!QQ2%JFZ&8B"HK`2N9Y#H
M.A5@=VR1]S;?8D!720-4JM]3`5B]DH'??7S<ZP=>YRZSH+J(56.J:.K=/VDW
MITD_=@_/1\?.VFE?N_U=#H@?[75[>]0(/9R."W0^2&FEOOBR=`_E#.W`6OA;
M!/G%9K%.:=(X-"J$N%V^=38EX4\#8P"&`-DYXPD03I-1$DLYF&A#0]7<)'!Q
MD#T"1LPJF;+"3C.R0*7P9*[&)^X7:7B64@5T)PX2BY7#TTQ-6[=^62QO5WB(
M:_[2XF%R[FP^HKW!G:UH',;)=(*">XB%E9&UZ32^J(7<"O2+-]_:^T)0>"H*
MF6>D=Z2>D`JL=\R0"X'P1L:^"2%')/^O/IZ$Z6G4T<&VEGE.4F=1-,5@TV%&
M0..?UJ02U8&R(O62IT_A$PZ&EK0`K652/ZR:&&GLSC-M,542*%YC<%3/KO,'
M_RK%MAZO@*5XHKP?FV5U*S%ZD)TB`M@ZL=`)W1D*#B!KTM\:63_H@\H>'7K>
M7Q@@--\)U;RED_]8ZHF+VH?1/ZSB>?I)E>96S(.6C+8<]`GK6LMLH^4SP>IE
MXK\:3Q=DG\N[(>AG07'J"(RII:8-D'=I.W1E=K68CJ0K'+/8E%S&!BI`&13>
MSB?,.CQL]P]<]/=>NZ]Q%T$'^)0\\11&TJVA*7(WF0<+T.6YU+M)F(SV)VEG
M00PLUF%N(&0SHGUC@E@'%5@`N,_U<!`^?^LZ='L[]4]<+SO9HL,)@N#>H\UE
ME_=?V83?Z<NT0763C#_L&KW,UKR1RU9)#2-7D%11#)G)K69-XMZ:-N<>LY;C
MGNVI%%0#9T..+TGTW.L:"QV(L7CJ\EG($\@(D'FT7DXG,"F:Q@?9,.E#OB*%
M4`K#BDO'J'`]BSM/P2J==9ZNTE31*(*[RZ878B=3(7IE8=U)+-^Z5P&0#(/(
M&F8C7UV-QC"W]_,!.`GX'A*IT0_Z"!T0<U][Z6+(`'QF<T4+D&*:X^H,DW$!
MR^*2K0I/8B7CC2>*BD/_3`3L.HZ!-=3Y+(B>U)IDNRHS$3[+Q5=F98GAM?B.
MV+B"V`2KXF9"]LZ?VED#&GDGA8@;M.4-&"W20ZI.:O7S)LF3.KGX184F1!RE
M/\_BE$2'.JMOP,W,XJN$QU53!B7LU5BSKAFI#<DIA?!>C)*R;G]:@TF,3UL%
MC^I6<0FP#AU)*H@LJ1']7(`4H>*^?0LNJN-7HT>^WJ-'[D%Q>VZ%<!.)\.'1
M[&$-EH8L&38F;4T$T>3](R%6]7P*-*@95A3+1>:M_72*1&8J.Q[2)*6]2L!\
MS"?,0F/7YFPRSCY+8].IM9RN9S@ML/QLC_*=,E19O[_]\"+_]4``:Z$LP#*-
MI2R/-.\CD[D?11&D7+UF)FKT\KMD1TUY0ND0/9(9OM&T<'!`)K2$5"[#60[[
M]X'->JXSIIZL)B-VTM"H,'^<C"8:]UE:E`O308MEBLD,/HKE3*I>[>DETVOI
M-P*FZ57GZ=*8M?L'@L`[\(DBZYP)-7N:G/W>OF[#%14[>'JP-`J^67>SFF2)
M>5YW74VD@"F<53>$.!2Z5==#L7P^.UWMTP9=Z0*XDJ51Q&M[IK=ZHFN.R&J;
M#I>SD9V0LSB]\H=C#*B/'M'PQ[3,\YO8`NB=J3)-P`Y3\8O6EFX/U:#4J(Q&
MC?Z"5LH%=@1DRHS7KV+'X0E.EW3),*3'4XLZ@H\?<[B0_^\9"841I!XHFJZG
M3#5KL,R%''1L@?I7I"[`\X\O6$=Q66RK]RG/IA4>DR,!_&&"YR7<@MF.:[LB
M/D"0AFBB%<+:K/.J-!]T.CM^I"I*TG?MM-0W>EX4L6XT9)P5?8*Y:1D9@CU]
MP*EHOH>[1''\,Z(X_DM$<?P_%T41=DA=C2B.`U'L];MMX'OZ/0-YP]""KO/%
M7,.$LQ,*ISLMKN]<9.A3(#?<^4!NHI]NI.W?]K^1PGO(X&DN8[12?GO.PM@"
M>(G^_NWL_6L1S-GH/H(Y$,$<#&H%,Y!(>MT?E\@-\E,6N7L4NU.F1&KIO\GJ
MI.8\;`)U,.",:SUD7GMLNDU$RG99%:O:T?NNF(T+$C+F%?0#@G;_9OY/HO;R
M^?F[7-CX4RYNZ)\*7!4,W*F3*"6?UTO5LB(=/]M$FU\3\8;O2U"M$`;>DJ*,
M[TBR+<DMHS"-H]QU+A;LRQP-HSW>E@,-C5ERF=$!R,R^3C#TR^(&B8DKW"SO
MH">%@^"O$Q)FH6@L`"FYGG70;064]%B='OF-O1'.N2T`]G\KKI$=OR<G'F->
M*[+0$<ORCER4]1^IV5:4(DU>0<XK@EXCZ76B7I1U'/2#ET8"G(`<0)>U;474
M\?@][B.G:]3O]MN#;@E%]H=WER!%!(Z[XM7[M%C9?'!2&9YMF^-+9O4PUVA9
ML8N,U>LGW)<N7B[4J*S!J&ANE(A(->[?D#=%`X<C:?TZJ_*:+*&7]A(F/H*!
MG/CC$6I/)\A13Q]9*FAHA@D=4(>XU5QBEKYZ-152G&7^\NB`EM%!6.'=066J
MEO(=BOS2L4O"7]`>2FZ1FCYR`1[/IF_+T[3V(S].(+@'/N'WR120*LP[FF3_
MG$`KXJE/RL0'SP*F9VVLCUUNY??=9[MRH/6`//4!.TY]W@4*:)].7^JW8U1/
MRI0=]I2JESEQ"-:L'+.+'>FCC-.BZVH$QD"';&1"RF=L.=35J-]B6IM-%+M0
M<8^1H0+'-TSF_:HK0Z]>O].OBR3-"BP[.)_B%"TCQ]87SGJE'FVY7!57+>:W
M47_X]8%/>(`<SOE8Q#<+SI#%T<4<X\CS"YG!&XR2+]3XQ:VFSNSUV&/2/^A9
MID9).S>_PMD_EX.4IGTK_1R/%C?(C+?%7EN7)=,I1QC)%+GT6XCW"BN93'84
MB$XSEJ`/1&H:K\9()\PASE:^S>B;.J:HH.MIA6\HX%8K^$3I7WG%_7*1I[ZM
M4DF[C)'/K];IG&`=`50NNX[S#X`ZL4^=3JO"9<35D,TH$^[![#K/_C,+#WT"
M259AXGWQ;?4/.7F0;(W:BR4#.CF_'MX&%UB9<*)R7`GY$B6UY<J\.5?17$]4
M<[D-.JV&B[8'3STMI_2SER'CA@FPF%]?B5_%K29O_B0GC04EGMJB\,2!?/""
M<DH@J&Y.8?M1H_!*]<^(*L\(RRKAP"9L>6+,`Z.`],7+U3_JEX+AG(O!>98"
MQYW+YB,W&U@J'H=_5\FV>V261:M"F:,G4M5CZ9YD"QMTS0&@/^(-6R77$X2"
M3[<>OM,_CSD*_'"+=!Q90CL2A-.?$*))LZ=7L>OH#[K>A!U`NO+R\ZII[;?=
M]O%V"\PF7Q0_S20O2@_;?DAE6H42_-0H(LT&-#%II?1$<:N''.4;T,%<7<%H
M9WF;,VN0/B"GT'N_HCA%7R70]Y_8HWNQ&-VR+F.W^A5B`&OJS86$C<:TH66[
M3JH)4/^2/>YK=A^CRBPF0XZ#K'0A55PH7+7BAMUC$,08ZY?&;C*_;C[RG;PP
M/R@5P+PVWY^_?'W^YHT$`NF?3Q6:J!]8XA:`F9+6Y6"WXC:VD6#+;>E]A9,L
M-$'`CFSU*#GAK])@/#V^R,,:S`,8\!)+T%-B>U)_E8SCU6B*J\TT2=S6<(AL
MPR&*<*NUZ\=;&Y/)ZAUR7HG!@/,&\VS=1</)S[_[=N9EX`8GVU1JA,VU,DS5
M3NX$O)Q"[IERYAE:-.LYZT_Y@@4ROOXU1$!2@[#(,KN>)I98I?GJHYPD[M5Z
M9!$[Z7+J^\JW*FV78_PMST161ID,JYLP&8)IV;-$MU_^VI;U4DP\L_HEH\=`
M=Y4!*:G>0<Y/X[RR8/FR])*%D2)8Q.+!X>#;-$MF^?>S9/&M$=L6P!VM)">-
M?5]+/#(A2FD@DW\GGN_6J2.(/0D(8H#J>*6WN1C8:ZSFJMC5E[_^8_@::*'G
MK3PHN?$$DM5"5KYK17Y5&XY986VV).?Y\<*X8F'X)6\J/((JY^:!U/CVK39"
M9JT7[=6-#08L6FZ2JY=.`G\DGCS#4RWF^"7974#TD8);Q?,U66O</KZ?A^$Q
M7!6YA$U7"/0W1XDAMC9:&)K`L&L*#R-QZP^`#CD"#L%R8]PW/:68WN&D%.SL
M$CH,`&R&D`7G!%V87;._K5Z!T609R>A9G-=10#S-/`)U9M;L9.;>OSE[)F%@
M6,.\_%CB20>F.-U(4J;&%6#R@CJEU;$L$`9QQ#:H]ZD[]%DQ:)&HJKM#B$*>
M_8-.1R+Z8B-)Q)3."9+J@"GSG#U##G=!X(_#Q4BZF/A5*VM,>65Y7R1GP,]V
MI&$]H4%KVV%3$E0`UP5+I?)H.^;]W*.C<+@+,Q^FL$M);=+9IBG?DV4.`$U7
MTK@$H:![S$:B]JY.@6SH:,ZL]3V-&J6N1@W].KR&2:UP6/G)^_+D'YC(H,-W
MS>2/];*KR4=49D<B]-*1>H45)''H=/S4X&BE'KK[-X5D%=]KBY0^^GW,7M7*
+XOX/]0H@'H9Q````
`
end

Comments?

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-17 23:47                                                                     ` Andrea Arcangeli
@ 1999-01-18  5:11                                                                       ` Linus Torvalds
  1999-01-18  7:28                                                                         ` Eric W. Biederman
  1999-01-18  9:15                                                                         ` Andrea Arcangeli
  1999-01-18 19:22                                                                       ` Andrea Arcangeli
  1 sibling, 2 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-18  5:11 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Steve Bergman, dlux, Nicholas J. Leon, Eric W. Biederman,
	Kalle Andersson, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie, Heinz Mauelshagen, Max

On Mon, 18 Jan 1999, Andrea Arcangeli wrote:
> 
> Even if rock solid my PG_dirty implementation is been a lose. This because
> swapping out from shrink_mmap() was causing not ordered write to disk. So
> even if the process userspace was ordered in the swap, it was async
> written not-ordered. This was harming a _lot_ swapout performances... 

Indeed. 

Note that what I really wanted to use PG_dirty for was not for normal
page-outs, but for shared mappings of files. 

For normal page-out activity, the PG_dirty thing isn't a win, simply
because (a) it doesn't actually buy us anything (we might as well do it 
from the page tables directly) and (b) as you noticed, it increases
fragmentation.

The reason PG_dirty should be a win for shared mappings is: (a) it gets
rid of the file write semaphore problem in a very clean way and (b) it
reduces the number of IO requests for mappings that are writable for
multiple contexts (right now we will actually do multiple page-outs, one
for each shared mapping that has dirtied the page). 

I know you worked on patches to reduce (b) by walking multiple page
tables, but quite frankly that was always so ugly as to never stand a
chance in h*ll of ever getting included in a standard kernel. 

I looked at the problem, and PG_dirty for shared mappings should be
reasonably simple. However, I don't think I can do it for 2.2 simply
because it involves some VFS interface changes (it requires that you can
use the pame_map[] information and nothing else to page out: we have the
inode and the offset which actually is enough data to do it, but we don't
have a good enough "inode->i_op->writepage()" setup yet).

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-18  5:11                                                                       ` Linus Torvalds
@ 1999-01-18  7:28                                                                         ` Eric W. Biederman
  1999-01-18 10:00                                                                           ` Andrea Arcangeli
  1999-01-18  9:15                                                                         ` Andrea Arcangeli
  1 sibling, 1 reply; 243+ messages in thread
From: Eric W. Biederman @ 1999-01-18  7:28 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Andrea Arcangeli, Steve Bergman, dlux, Nicholas J. Leon,
	Kalle Andersson, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie, Heinz Mauelshagen, Max

>>>>> "LT" == Linus Torvalds <torvalds@transmeta.com> writes:


LT> Note that what I really wanted to use PG_dirty for was not for normal
LT> page-outs, but for shared mappings of files. 

LT> For normal page-out activity, the PG_dirty thing isn't a win, simply
LT> because (a) it doesn't actually buy us anything (we might as well do it 
LT> from the page tables directly) 

If we combine it with early scanning of the page tables,  it should allow us
to perform I/O before we need the memory.  While not bogging down processes.

LT> and (b) as you noticed, it increases fragmentation.

This is only because he didn't implement any kind of request queue.
A fifo queue of pages to write would have kept performance up at current levels.

LT> The reason PG_dirty should be a win for shared mappings is: (a) it gets
LT> rid of the file write semaphore problem in a very clean way and 

Nope.  Because we can still have some try to write to file X.
That write needs memory, and we try to swapout a mapping of file X.
Unless you believe it implies the write outs then must use a separate process.

LT> (b) it
LT> reduces the number of IO requests for mappings that are writable for
LT> multiple contexts (right now we will actually do multiple page-outs, one
LT> for each shared mapping that has dirtied the page). 

With a reasonable proximity in time even that isn't necessary because of the buffer
cache.  This is only a real issue for filesystems that don't implement good
write caching on their own.  In which case the primary thing to fix is
caching for those filesystems.  We can use PG_dirty for that case too, but
the emphasis is different.

LT> I looked at the problem, and PG_dirty for shared mappings should be
LT> reasonably simple. However, I don't think I can do it for 2.2 simply
LT> because it involves some VFS interface changes (it requires that you can
LT> use the pame_map[] information and nothing else to page out: we have the
LT> inode and the offset which actually is enough data to do it, but we don't
LT> have a good enough "inode->i_op->writepage()" setup yet).

At least in part because for NFS you need who is doing the write, which
we currently store with a struct file.  To get it correct we really
need a two level setup.  Level 1 per pte to enter in the write request.
Level 2 per page to actually write it out.

If there are conflicts, (not the same user, etc) Level 1 can handle them.

If I have any luck I should have a draft of a complete set of changes to do
all of this for 2.3 in about a month.  I have been working on this in the
background for quite a while.

Eric




--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-18  5:11                                                                       ` Linus Torvalds
  1999-01-18  7:28                                                                         ` Eric W. Biederman
@ 1999-01-18  9:15                                                                         ` Andrea Arcangeli
  1999-01-18 17:49                                                                           ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-18  9:15 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Eric W. Biederman, Zlatko Calusic, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie

On Sun, 17 Jan 1999, Linus Torvalds wrote:

> Note that what I really wanted to use PG_dirty for was not for normal
> page-outs, but for shared mappings of files. 

Ah, Ok ;). I was using it in a completely different manner. I was using it
to indicate that the swap cache page was not uptodate on the swap space.
But as just said this cause not ordered write to disk from shrink_mmap().
Other than that it _was_ fine.

> The reason PG_dirty should be a win for shared mappings is: (a) it gets
> rid of the file write semaphore problem in a very clean way and (b) it

I can't understand this. I think to know _where_ to mark the page dirty
(in the `if (vm_op->swapout)' path) but I don't understand _where_ to
write the page out to disk avoiding the fs deadlock. Writing them in
shrink_mmap() would not fix the deadlock (obviously if shrink_mmap() is
still recalled as now by try_to_free_pages() etc...). 

> I know you worked on patches to reduce (b) by walking multiple page
> tables, but quite frankly that was always so ugly as to never stand a

OK, agreed ;). I am taking it here in the meantime only because it should
be at least safe. 

> I looked at the problem, and PG_dirty for shared mappings should be
> reasonably simple. However, I don't think I can do it for 2.2 simply
> because it involves some VFS interface changes (it requires that you can
> use the pame_map[] information and nothing else to page out: we have the
> inode and the offset which actually is enough data to do it, but we don't
> have a good enough "inode->i_op->writepage()" setup yet).

I still don't understand from _where_ doing the writepage. If we would do
it from shrink_mmap() I can't see how we could clear the pte of the
process (or better processes) before starting the writepage(). Probably I
am missing something of important (maybe because these nights I had not a
lot of time to sleep ;)... 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-18  7:28                                                                         ` Eric W. Biederman
@ 1999-01-18 10:00                                                                           ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-18 10:00 UTC (permalink / raw
  To: Eric W. Biederman; +Cc: Linus Torvalds, linux-kernel, linux-mm

On 18 Jan 1999, Eric W. Biederman wrote:

> LT> and (b) as you noticed, it increases fragmentation.
> 
> This is only because he didn't implement any kind of request queue.  A
> fifo queue of pages to write would have keep performance up at current
> levels. 

Infact I didn't wanted having to alloc more memory in order to free memory
(it's something I like to avoid). But the point is that I think that
swapping out from shrink_mmap() even if doing ordered I/O is not a win.
Try, benchmark and let me know your results, maybe I am wrong.  And with a
FIFO also shrink_mmap() would change in order to do what swap_out() is
doing right now. And btw I think that the fifo could be approximated to a
browse in the swap cache. 

> LT> The reason PG_dirty should be a win for shared mappings is: (a) it gets
> LT> rid of the file write semaphore problem in a very clean way and 
> 
> Nope.  Because we can still have some try to write to file X.
> That write needs memory, and we try to swapout a mapping of file X.
> Unless you believe it implies the write outs then must use a seperate process.

Agreed. I just pointed this out, but maybe I did not understand _where_ we
should do the write to disk to reclaim memory. 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-18  9:15                                                                         ` Andrea Arcangeli
@ 1999-01-18 17:49                                                                           ` Linus Torvalds
  0 siblings, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-18 17:49 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Eric W. Biederman, Zlatko Calusic, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie


On Mon, 18 Jan 1999, Andrea Arcangeli wrote:
> 
> I can't understand this. I think to know _where_ to mark the page dirty
> (in the `if (vm_op->swapout)' path) but I don't understand _where_ to
> write the page out to disk avoiding the fs deadlock. Writing them in
> shrink_mmap() would not fix the deadlock (obviously if shrink_mmap() is
> still recalled as now by try_to_free_pages() etc...). 

You'd write them out only from a separate daemon that only needs to scan
the physical page map. That separate daemon might actually be kswapd, but
that's just an implementation detail rather than a conceptual issue.

		Linus

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty
  1999-01-17 23:47                                                                     ` Andrea Arcangeli
  1999-01-18  5:11                                                                       ` Linus Torvalds
@ 1999-01-18 19:22                                                                       ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-18 19:22 UTC (permalink / raw
  To: Steve Bergman, dlux, Nicholas J. Leon, Eric W. Biederman,
	Kalle Andersson
  Cc: Linus Torvalds, brent verner, Garst R. Reese, Kalle Andersson,
	Zlatko Calusic, Ben McCann, bredelin, linux-kernel, linux-mm,
	Alan Cox, Stephen C. Tweedie, Heinz Mauelshagen, Max

On Mon, 18 Jan 1999, Andrea Arcangeli wrote:

> Also you Kelle, could you try my new arca-vm-24 on your 16Mbyte machine? I
> think that you'll be very happy ;).
> 
> Note, arca-vm-24 is tunable:

Ehmmm, ohoh, I did a little mistake in arca-vm-24 that was causing a
lockup under heavy swapping ;) (I was still decrementing counter in the
grow_freeable() while loop...) 

So I diffed a fixed arca-vm-25 that rocks here. It's against pre-8.gz in
testing. This new one should be really safe (and pre8 is also fixing
the swapout deadlock making the semaphores recursive). Give me comments if
you'll try it. arca-vm-24/25 still seems far better than anything tried
before here. I don't think I could do something better than it for 2.2.

ftp://e-mind.com/pub/linux/kernel-patches/2.2.0-pre8testing-arca-VM-25.gz

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-14 14:53                                                                         ` Dr. Werner Fink
@ 1999-01-21 16:50                                                                           ` Stephen C. Tweedie
  1999-01-21 19:53                                                                             ` Andrea Arcangeli
  1999-01-23 23:20                                                                             ` Alan Cox
  1999-01-22 16:29                                                                           ` Eric W. Biederman
  1 sibling, 2 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-21 16:50 UTC (permalink / raw
  To: Dr. Werner Fink
  Cc: Stephen C. Tweedie, Andrea Arcangeli, Rik van Riel,
	Zlatko Calusic, Linus Torvalds, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

Hi,

On Thu, 14 Jan 1999 15:53:21 +0100, "Dr. Werner Fink" <werner@suse.de> said:

>> There is no aging on the page cache at all other than the PG_referenced
>> bit.

> I know that most of you do not like aging.  Nevertheless, on high stressed
> systems with less than 128M you will see a critical point whereas the page
> cache and readahead does not avoid that swapin I/O time needed by a program
> increases to similar size of the average program time slice.

There's no reason why timeslices should have anything to do with swapin
IO time; we do not count time spent waiting for IO against the process's
allocated timeslice.

> What's about a simple aging of program page cluster or better of the
> page cache?  Increasing the age could be done if and only if the pages
> or page clusters swapped in and the program wasn't able to use its
> time slice. Decreasing the age could be placed in shrink_mmap().

Page aging dramatically increases the amount of CPU time we spend
looking for free pages.  The selection of which pages to swap out really
shouldn't have anything to do with scheduling of CPU-bound processes
(which is the only time where timeslices actually count for anything).

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-21 16:50                                                                           ` Stephen C. Tweedie
@ 1999-01-21 19:53                                                                             ` Andrea Arcangeli
  1999-01-22 13:55                                                                               ` Stephen C. Tweedie
  1999-01-23 23:20                                                                             ` Alan Cox
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-21 19:53 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Dr. Werner Fink, Rik van Riel, Zlatko Calusic, Linus Torvalds,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Thu, 21 Jan 1999, Stephen C. Tweedie wrote:

> > I know that most of you do not like aging.  Nevertheless, on high stressed
> > systems with less than 128M you will see a critical point whereas the page
> > cache and readahead does not avoid that swapin I/O time needed by a program
> > increases to similar size of the average program time slice.
> 
> There's no reason why timeslices should have anything to do with swapin
> IO time; we do not count time spent waiting for IO against the process's
> allocated timeslice.

Yes we do I/O async so while the I/O is in action we could be just back in
userspace, but both shrink_mmap() and swap_out() are not something of
really so light (at least with >128Mbyte of ram). When we are running in
shrink_mmap() the current->counter is decreased as usual.

It's trivial conceptually make shrink_mmap() _fast_, adding two
prev_freeable,next_freeable pointers in the mem_map struct and adding
pages back and forth to the list (at the same time I now update
nr_freeable_pages). Probably I'll do that soon.

I see instead not trivial to decrease the cost of swap_out()... 

I agree that the timeslice has nothing to do with swapout/shrink_mmap
issue. But the timeslice _must_ be decremented as now during the
shrink_mmap/swapout passes, because otherwise we would risk to stall the
not trashing process too much. 

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-21 19:53                                                                             ` Andrea Arcangeli
@ 1999-01-22 13:55                                                                               ` Stephen C. Tweedie
  1999-01-22 19:45                                                                                 ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-22 13:55 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Stephen C. Tweedie, Dr. Werner Fink, Rik van Riel, Zlatko Calusic,
	Linus Torvalds, Eric W. Biederman, Savochkin Andrey Vladimirovich,
	steve, brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

Hi,

On Thu, 21 Jan 1999 20:53:28 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

> Yes we do I/O async so while the I/O is in action we could be just back in
> userspace, but both shrink_mmap() and swap_out() are not something of
> really so light (at least with >128Mbyte of ram). When we are running in
> shrink_mmap() the current->counter is decreased as usual.

If shrink_mmap() can exhaust the timeslice while we are swapping (ie. we
are IO-bound), then something is *SERIOUSLY* wrong!

--Stephen
--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-14 14:53                                                                         ` Dr. Werner Fink
  1999-01-21 16:50                                                                           ` Stephen C. Tweedie
@ 1999-01-22 16:29                                                                           ` Eric W. Biederman
  1999-01-25 13:14                                                                             ` Dr. Werner Fink
  1 sibling, 1 reply; 243+ messages in thread
From: Eric W. Biederman @ 1999-01-22 16:29 UTC (permalink / raw
  To: Dr. Werner Fink
  Cc: Stephen C. Tweedie, Andrea Arcangeli, Rik van Riel,
	Zlatko Calusic, Linus Torvalds, Eric W. Biederman,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

>>>>> "WF" == Werner Fink <werner@suse.de> writes:

WF> I know that most of you do not like aging.

We love aging.  We dislike the BS that called itself aging code.
It implemented something like least frequently used.  Instead of
least recently used.  We dislike least frequently used because it's a poor
predictor of pages to be used next, and a cpu hog.

WF> At this point the system performance breaks down dramatically even
WF> with 2.2.0pre[567] ...

If you could demonstrate this it would aid any plea for changing the VM system.

WF> What's about a simple aging of program page cluster or better of the
WF> page cache? 

We do age pages.  The PG_referenced bit.  This scheme as far as I can
tell is more effective at predicting pages we are going to use next
than any we have used before.

WF> Increasing the age could be done if and only if the pages
WF> or page clusters swapped in and the program wasn't able to use its
WF> time slice. Decreasing the age could be placed in shrink_mmap().

People keep playing with ignoring PG_referenced in shrink_mmap for the swap cache,
because it doesn't seem terribly important.  If you could demonstrate
this is a problem we can stop ignoring it.

Eric


--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-22 13:55                                                                               ` Stephen C. Tweedie
@ 1999-01-22 19:45                                                                                 ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-22 19:45 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Dr. Werner Fink, Rik van Riel, Zlatko Calusic, Linus Torvalds,
	Eric W. Biederman, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Fri, 22 Jan 1999, Stephen C. Tweedie wrote:

> If shrink_mmap() can exhaust the timeslice while we are swapping (ie. we

When we are in shrink_mmap() we are _not_ swapping. We are instead looping
in the mem_map pool.

If we instead block in I/O sync, the timeslice should be not touched.

Andrea Arcangeli

--
This is a majordomo managed list.  To unsubscribe, send a message with
the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-21 16:50                                                                           ` Stephen C. Tweedie
  1999-01-21 19:53                                                                             ` Andrea Arcangeli
@ 1999-01-23 23:20                                                                             ` Alan Cox
  1999-01-24  0:19                                                                               ` Linus Torvalds
  1 sibling, 1 reply; 243+ messages in thread
From: Alan Cox @ 1999-01-23 23:20 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: werner, andrea, riel, Zlatko.Calusic, torvalds, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, alan,
	bredelin, linux-kernel, linux-mm

> Page aging dramatically increases the amount of CPU time we spend
> looking for free pages.  The selection of which pages to swap out really

Thats a bug in our current vm structures, like the others - inability to
throw out page tables, inability to find memory easily, inability to move
blocks to allocate large areas in a target space, inability to handle
large user spaces etc.

At least 2.3 will have plenty of fun things to do 8)

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-23 23:20                                                                             ` Alan Cox
@ 1999-01-24  0:19                                                                               ` Linus Torvalds
  1999-01-24 18:33                                                                                 ` Gregory Maxwell
                                                                                                   ` (2 more replies)
  0 siblings, 3 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-24  0:19 UTC (permalink / raw
  To: Alan Cox
  Cc: Stephen C. Tweedie, werner, andrea, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm


On Sat, 23 Jan 1999, Alan Cox wrote:
> 
> Thats a bug in our current vm structures, like the others - inability to
> throw out page tables, inability to find memory easily, inability to move
> blocks to allocate large areas in a target space, inability to handle
> large user spaces etc.

What? None of those are bugs, they are features.

Complexity is not a goal to be reached. Complexity is something to be
avoided at all cost. If you don't believe me, look at NT.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-24  0:19                                                                               ` Linus Torvalds
@ 1999-01-24 18:33                                                                                 ` Gregory Maxwell
  1999-01-25  0:21                                                                                   ` Linus Torvalds
  1999-01-24 20:33                                                                                 ` Alan Cox
  1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
  2 siblings, 1 reply; 243+ messages in thread
From: Gregory Maxwell @ 1999-01-24 18:33 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Alan Cox, Stephen C. Tweedie, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

On Sat, 23 Jan 1999, Linus Torvalds wrote:

> On Sat, 23 Jan 1999, Alan Cox wrote:
> > 
> > Thats a bug in our current vm structures, like the others - inability to
> > throw out page tables, inability to find memory easily, inability to move
> > blocks to allocate large areas in a target space, inability to handle
> > large user spaces etc.
> 
> What? None of those are bugs, they are features.
> 
> Complexity is not a goal to be reached. Complexity is something to be
> avoided at all cost. If you don't believe me, look at NT.
> 
> 		Linus

Make things as simple as possible, but no simpler.

Do you really think "inability to handle large user spaces" or "inability
to find memory easily" are features? 

Perhaps all the current solutions have been overly complex, however, that
doesn't mean there is no simple way to accomplish the same thing. 



--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-24  0:19                                                                               ` Linus Torvalds
  1999-01-24 18:33                                                                                 ` Gregory Maxwell
@ 1999-01-24 20:33                                                                                 ` Alan Cox
  1999-01-25  0:27                                                                                   ` Linus Torvalds
  1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
  2 siblings, 1 reply; 243+ messages in thread
From: Alan Cox @ 1999-01-24 20:33 UTC (permalink / raw
  To: Linus Torvalds
  Cc: alan, sct, werner, andrea, riel, Zlatko.Calusic, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm

> On Sat, 23 Jan 1999, Alan Cox wrote:
> > 
> > Thats a bug in our current vm structures, like the others - inability to
> > throw out page tables, inability to find memory easily, inability to move
> > blocks to allocate large areas in a target space, inability to handle
> > large user spaces etc.
> 
> What? None of those are bugs, they are features.
> 
> Complexity is not a goal to be reached. Complexity is something to be
> avoided at all cost. If you don't believe me, look at NT.

A feature becomes a bug at the point it becomes a problem. Right now there
is a continual background DMA rumbling. That one at least needs solving.

Being able to throw out page tables is something that is going to be needed
too. As far as I can see that does not mean complexity. The Linux VM is
very clean in its page handling, there is almost nothing in the page tables
that cannot be flushed or dumped to disk if need be.

There are real cases where grab large linear block is needed. Sadly the
fact that NT and 98 support it will make this more not less common. The
current PCI soundcards like the S3 SonicVibes aren't easily supportable
in Linux because they require a 4Mb linear block. The Zoran video capture
chipset (Trust, Iomega, and others) needs large linear blocks. Even I2O
wants 32/64K linear chunks and that's designed to be "OS independent"

Its on my "please for 2.3" list not because the linear block problem is an
elegance issue but because people are banging their heads on it. The large
physical memory problem is there because people are already hitting it.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-24 18:33                                                                                 ` Gregory Maxwell
@ 1999-01-25  0:21                                                                                   ` Linus Torvalds
  1999-01-25  1:28                                                                                     ` Alan Cox
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-25  0:21 UTC (permalink / raw
  To: Gregory Maxwell
  Cc: Alan Cox, Stephen C. Tweedie, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm


On Sun, 24 Jan 1999, Gregory Maxwell wrote:
> 
> Do you really think "inability to handle large user spaces" or "inability
> to find memory easily" are features? 

Alan is just full of it on both accounts.

We handle large user space with no problem, and we find free memory no
problem.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-24 20:33                                                                                 ` Alan Cox
@ 1999-01-25  0:27                                                                                   ` Linus Torvalds
  1999-01-25  1:38                                                                                     ` Alan Cox
  0 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-25  0:27 UTC (permalink / raw
  To: Alan Cox
  Cc: sct, werner, andrea, riel, Zlatko.Calusic, ebiederm+eric, saw,
	steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm


On Sun, 24 Jan 1999, Alan Cox wrote:
>
> Being able to throw out page tables is something that is going to be needed
> too. As far as I can see that does not mean complexity. The Linux VM is
> very clean in its page handling, there is almost nothing in the page tables
> that cannot be flushed or dumped to disk if need be.

There _is_ a major problem: being able to swap out page tables means that
the thing that swaps them out _has_ to own the mm semaphore. 

That's the right thing to do anyway, but it means, for example, that the
_only_ process that can page stuff out would be kswapd. 

Who knows? Maybe I should just bite the bullet and make that the rule,
then we could forget about all the extra recursive semaphore crap too. And
it has other advantages - it can speed up the page fault handler (which
right now has to get the kernel lock for certain situations). 

Once that is done, paging out page tables is not really a problem.

> There are real cases where grab large linear block is needed.

Nobody has so far shown a reasonable implementation where this would be
possible.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  1:38                                                                                     ` Alan Cox
@ 1999-01-25  1:04                                                                                       ` Andrea Arcangeli
  1999-01-25  2:10                                                                                         ` Alan Cox
  1999-01-25 21:59                                                                                       ` Gerard Roudier
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-25  1:04 UTC (permalink / raw
  To: Alan Cox
  Cc: Linus Torvalds, sct, werner, riel, Zlatko.Calusic, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm

On Mon, 25 Jan 1999, Alan Cox wrote:

> Thats as maybe. However someone needs to find a way to do it. Right now I
> can run a matrox meteor on netbsd,freebsd,openbsd,windows95, nt but not Linux

If I understand well the problem is get more than 1<<maxorder contiguos
phys pages in RAM. I think it should not too difficult to do a dirty hack
to have such contiguos RAM without wait for 2.[34]. I could implement a
alternate __get_big_pages that does some try to get many mem-areas of the
maximal order contigous. Maybe it will not able to give you such contiguos
memory (due mem fragmentation) but if it's possible it will give back it
to you (_slowly_). Then you should use an aware free_big_pages() to give
back the memory. That way the codebase (for people that doesn't need
__get_big_pages in their device drivers) will be untouched (so no codebase
stability issues). 

Andrea Arcangeli

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  0:21                                                                                   ` Linus Torvalds
@ 1999-01-25  1:28                                                                                     ` Alan Cox
  1999-01-25  3:35                                                                                       ` pmonta
  1999-01-25  4:17                                                                                       ` Linus Torvalds
  0 siblings, 2 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-25  1:28 UTC (permalink / raw
  To: Linus Torvalds
  Cc: linker, alan, sct, werner, andrea, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm

> On Sun, 24 Jan 1999, Gregory Maxwell wrote:
> > 
> > Do you really think "inability to handle large user spaces" or "inability
> > to find memory easily" are features? 
> 
> Alan is just full of it on both accounts.
> 
> We handle large user space with no problem, and we find free memory no
> problem.

Oh good, whats the configuration setting for a 4Gig Xeon box. I've got
people dying to know. So I'm not full of it.

It's not "inability to find memory easily" in my original comments either.
In context it's about the expense sometimes of finding which things to swap.

Note that I don't disagree with Linus. Every time Linus can say "but you don't
need that because [sensible solution]" is a bigger win than adding a ton
of special case code.

Right now

o	I can't run 3Gig user processes on a 4Gig Xeon
o	I can't support devices needing large physically linear blocks of
	memory

#1 is happening today
#2 is happening a bit now - although it's a lesser problem (unable to allocate
ISA DMA buffer..) that's the visible part of a bigger issue. Some people
don't bother with scatter gather DMA - real examples:
	S3 Sonic Vibes	- linux can't support its wavetable (wants 4Mb linear)
	Zoran based capture chips - physically linear capture/masks
	Matrox Meteor frame grabber - physically linear grabbing

So 2.3 needs to be able to allocate large linear physical spaces - not
neccessarily efficiently either. These are all occasional grabs of memory.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  0:27                                                                                   ` Linus Torvalds
@ 1999-01-25  1:38                                                                                     ` Alan Cox
  1999-01-25  1:04                                                                                       ` Andrea Arcangeli
  1999-01-25 21:59                                                                                       ` Gerard Roudier
  0 siblings, 2 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-25  1:38 UTC (permalink / raw
  To: Linus Torvalds
  Cc: alan, sct, werner, andrea, riel, Zlatko.Calusic, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm

> > There are real cases where grab large linear block is needed.
> 
> Nobody has so far shown a reasonable implementation where this would be
> possible.

Thats as maybe. However someone needs to find a way to do it. Right now I
can run a matrox meteor on netbsd,freebsd,openbsd,windows95, nt but not Linux

That's not meant as a flippant remark - it's something to be stuck on the 2.3
problem chart. It's just a question of who out there is sitting on the 
solution

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  1:04                                                                                       ` Andrea Arcangeli
@ 1999-01-25  2:10                                                                                         ` Alan Cox
  1999-01-25  3:16                                                                                           ` Garst R. Reese
  1999-01-25 14:06                                                                                           ` Rik van Riel
  0 siblings, 2 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-25  2:10 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: alan, torvalds, sct, werner, riel, Zlatko.Calusic, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm

> If I understand well the problem is get more than 1<<maxorder contiguos
> phys pages in RAM. I think it should not too difficult to do a dirty hack

Yep. We are talking about 2->4Mb sized chunks. We are also talking about
chunks that are allocated rarely - for example when you load wave data
into the sound card, while you are capturing etc. So its blocks that
can be slow to allocate, slow to free, so long as they are normal speed
to access. That may make the problem a lot easier

> alternate __get_big_pages that does some try to get many mem-areas of the
> maximal order contigous. Maybe it will not able to give you such contiguos
> memory (due mem fragmentation) but if it's possible it will give back it
> to you (_slowly_). Then you should use an aware free_big_pages() to give
> back the memory. That way the codebase (for people that doesn't need
> __get_big_pages in their device drivers) will be untouched (so no codebase
> stability issues). 

That fact we effectively "poison" the various blocks of memory with locked
down kernel objects is what makes this so tricky. It really needs some back
pressure applied so that kernel allocations come from a limited number of
maxorder blocks, at least except under exceptional circumstances.

I think its too tricky for 2.2 even as a later retrofit

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  2:10                                                                                         ` Alan Cox
@ 1999-01-25  3:16                                                                                           ` Garst R. Reese
  1999-01-25 10:49                                                                                             ` Alan Cox
  1999-01-25 14:06                                                                                           ` Rik van Riel
  1 sibling, 1 reply; 243+ messages in thread
From: Garst R. Reese @ 1999-01-25  3:16 UTC (permalink / raw
  To: Alan Cox
  Cc: Andrea Arcangeli, torvalds, sct, werner, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, kalle.andersson, bmccann,
	bredelin, linux-kernel, linux-mm

Alan Cox wrote:
> 
> > If I understand well the problem is get more than 1<<maxorder contiguos
> > phys pages in RAM. I think it should not too difficult to do a dirty hack
> 
> Yep. We are talking about 2->4Mb sized chunks. We are also talking about
> chunks that are allocated rarely - for example when you load wave data
> into the sound card, while you are capturing etc. So its blocks that
> can be slow to allocate, slow to free, so long as they are normal speed
> to access. That may make the problem a lot easier
How much of this problem can be solved with a ramfs that takes what you
give it at boot time?
-- 
Garst
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  1:28                                                                                     ` Alan Cox
@ 1999-01-25  3:35                                                                                       ` pmonta
  1999-01-25  4:17                                                                                       ` Linus Torvalds
  1 sibling, 0 replies; 243+ messages in thread
From: pmonta @ 1999-01-25  3:35 UTC (permalink / raw
  To: alan
  Cc: torvalds, linker, sct, werner, andrea, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm

Alan Cox writes:

> I can't support devices needing large physically linear blocks of
> memory ...
>
> S3 Sonic Vibes	- linux can't support its wavetable (wants 4Mb linear)
> Zoran based capture chips - physically linear capture/masks
> Matrox Meteor frame grabber - physically linear grabbing
>
> So 2.3 needs to be able to allocate large linear physical spaces - not
> neccessarily efficiently either. These are all occasional grabs of memory.

Yes---physical addressing for I/O is reality.  Some devices may
not implement scatter-gather, and some may do so and yet still be
afflicted with high latencies for descriptor fetching and
the like.

If allocations are rare, it doesn't seem that unreasonable to actually
do physical copies, push stuff bodily out of the way to construct a new
contiguous region.  Or else a separate allocator, like the present-day
bigphysarea.

Cheers,
Peter Monta   pmonta@imedia.com
Imedia Corp.
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  1:28                                                                                     ` Alan Cox
  1999-01-25  3:35                                                                                       ` pmonta
@ 1999-01-25  4:17                                                                                       ` Linus Torvalds
  1 sibling, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-25  4:17 UTC (permalink / raw
  To: Alan Cox
  Cc: linker, sct, werner, andrea, riel, Zlatko.Calusic, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm


On Mon, 25 Jan 1999, Alan Cox wrote:
> 
> Oh good, whats the configuration setting for a 4Gig Xeon box. I've got
> people dying to know. So I'm not full of it.

Oh, the answer is very simple: it's not going to happen.

EVER.

You need more than 32 bits of address space to handle that kind of memory. 
This is not something I'm going to discuss further. If people want to use
more than 2GB of memory, they have exactly two options with Linux: 

 - get a machine with reasonable address spaces. Right now that's either
   alpha or sparc64, in the not too distant future it will be merced.
 - use the extra memory as a ram-disk (possibly memory-mappable, but even
   that I consider unlikely)

This is not negotiable.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  3:16                                                                                           ` Garst R. Reese
@ 1999-01-25 10:49                                                                                             ` Alan Cox
  0 siblings, 0 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-25 10:49 UTC (permalink / raw
  To: Garst R. Reese
  Cc: alan, andrea, torvalds, sct, werner, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, kalle.andersson, bmccann,
	bredelin, linux-kernel, linux-mm

> How much of this problem can be solved with a ramfs that takes what you
> give it at boot time?

Grabbing 4Mb for wave tables, and 4Mb for a matrox meteor at boot "just
in case" is at the "you might as well run another OS" level of "supported"
IMHO anyway. Its the right answer for any 2.2 retrofits

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-22 16:29                                                                           ` Eric W. Biederman
@ 1999-01-25 13:14                                                                             ` Dr. Werner Fink
  1999-01-25 17:56                                                                               ` Stephen C. Tweedie
  1999-01-25 19:10                                                                               ` Andrea Arcangeli
  0 siblings, 2 replies; 243+ messages in thread
From: Dr. Werner Fink @ 1999-01-25 13:14 UTC (permalink / raw
  To: Eric W. Biederman, Dr. Werner Fink
  Cc: Stephen C. Tweedie, Andrea Arcangeli, Rik van Riel,
	Zlatko Calusic, Linus Torvalds, Savochkin Andrey Vladimirovich,
	steve, brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Fri, Jan 22, 1999 at 10:29:05AM -0600, Eric W. Biederman wrote:
> 
> WF> At this point the system performance breaks down dramatically even
> WF> with 2.2.0pre[567] ...
> 
> If you could demonstrate this it would aid any plea for changing the VM system.

I'm using simple two loops in different kernel trees:

      while true; do make clean; make MAKE='make -j10'; done

which leads into load upper 30.  You can see a great performance upto
load to 25 ... 30+ *and* a brutal break down of that performance
at this point.  The system is a PentiumII 400MHz with 32, 64, 128MB
(mem=xxx) and SCSI only.  In comparision to 2.0.36 the performance
is *beside of this break down* much better ...  that means that only
the performance break down at high load is the real problem.

> 
> WF> What's about a simple aging of program page cluster or better of the
> WF> page cache? 
> 
> We do age pages.  The PG_referenced bit.  This scheme as far as I can
> tell is more effective at predicting pages we are going to use next
> than any we have used before.

What about a `PG_recently_swapped_in' bit for pages which aren't found
anymore with the swap cache?  This isn't a prediction but a protection
against throwing out the same page in the following cycle.

> 
> WF> Increasing the age could be done if and only if the pages
> WF> or page clusters swapped in and the program wasn't able to use its
> WF> time slice. Decreasing the age could be placed in shrink_mmap().
> 
> People keep playing with ignoring PG_referenced in shrink_mmap for the swap cache,
> because it doesn't seem terribly important.  If you could demonstrate
> this is a problem we can stop ignoring it.
> 
> Eric
> 


            Werner
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  2:10                                                                                         ` Alan Cox
  1999-01-25  3:16                                                                                           ` Garst R. Reese
@ 1999-01-25 14:06                                                                                           ` Rik van Riel
  1 sibling, 0 replies; 243+ messages in thread
From: Rik van Riel @ 1999-01-25 14:06 UTC (permalink / raw
  To: Alan Cox
  Cc: Andrea Arcangeli, Linus Torvalds, Stephen C. Tweedie,
	Dr. Werner Fink, Zlatko Calusic, ebiederm+eric, saw, bredelin,
	Linux Kernel, Linux MM

On Mon, 25 Jan 1999, Alan Cox wrote:

> > If I understand well the problem is get more than 1<<maxorder contiguos
> > phys pages in RAM. I think it should not too difficult to do a dirty hack
> 
> Yep. We are talking about 2->4Mb sized chunks. We are also talking about
> chunks that are allocated rarely 

> > alternate __get_big_pages that does some try to get many mem-areas of the
> > maximal order contigous. Maybe it will not able to give you such contiguos
> > memory (due mem fragmentation) but if it's possible it will give back it
> > to you (_slowly_).
> 
> That fact we effectively "poison" the various blocks of memory
> with locked down kernel objects is what makes this so tricky. It
> really needs some back pressure applied so that kernel allocations
> come from a limited number of maxorder blocks, at least except
> under exceptional circumstances.

We need a different memory allocator for that. Maybe it's
time to dig up my zone allocator proposal (on my home page)
and adapt it to something working.

Unfortunately I don't have the time to do that, so I'll
leave the job to Alan or Stephen (who should have the time
since they're with Red Hat)...

> I think its too tricky for 2.2 even as a later retrofit

Once the allocator is ready and stabilized, we might be
able to retrofit it to 2.2. It's just a single module
we need to touch...

cheers,

Rik -- If a Microsoft product fails, who do you sue?
+-------------------------------------------------------------------+
| Linux memory management tour guide.             riel@nl.linux.org |
| Scouting Vries cubscout leader.     http://www.nl.linux.org/~riel |
+-------------------------------------------------------------------+

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-24  0:19                                                                               ` Linus Torvalds
  1999-01-24 18:33                                                                                 ` Gregory Maxwell
  1999-01-24 20:33                                                                                 ` Alan Cox
@ 1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
  1999-01-25 16:52                                                                                   ` Andrea Arcangeli
                                                                                                     ` (3 more replies)
  2 siblings, 4 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-25 16:25 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Alan Cox, Stephen C. Tweedie, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

Hi,

On Sat, 23 Jan 1999 16:19:13 -0800 (PST), Linus Torvalds
<torvalds@transmeta.com> said:

> Complexity is not a goal to be reached. Complexity is something to be
> avoided at all cost. If you don't believe me, look at NT.

Nevertheless, the 2.2.0-pre9 VM sucks.  I've been getting seriously
frustrated at pre-9's interactive feel over the past few days.

Linus, there really are fundamental problems remaining in the VM in
2.2.0-pre right now.  The two biggest are the lack of responsiveness
of kswapd and a general misbalance in the cache management.

The kswapd in pre9 is improved, but still only checks status at 1HZ.
Once we detect an out-of-memory condition, then yes, we increase that
frequency, but it means we can take a long time to start responding to
(say) a burst of network traffic, and the free list can be completely
exhausted long before kswapd notices.

The second, balancing issue is evident in a general all-round
performance degradation when under VM load.  I notice this on 64M and on
8M.  Interactive response is simply poor all over, and build times are
excessive especially in low memory configurations.

Regarding the former, is there any chance you'd consider adding a kswapd
wakeup when low_memory gets set in get_free_pages()?  Being able to
respond to a burst in network traffic without locking up is not exactly
a minor issue.

As for the balancing, the tiny patch below seems to completely restore
the responsiveness and throughput of the VM we had in the 132-ac*
kernels.  8MB builds are very much faster.  Responsiveness on memory
sizes up to 64MB is improved both when we have several competing tasks
running and when simply switching between applications.  vmstat shows
swapouts clustered well: I see between 3 and 6 times the swap throughput
that pre9 alone has, and swap bursts end in about a quarter of the time
as under plain pre9.

The changes are very similar to the self-tuning clock counter we had in
those ac* vms.  The modified shrink_mmap() just avoids decrementing the
count for locked, non-DMA (if GFP_DMA) or multiply-mapped pages.  The
effect is to avoid counting memory mapped pages when we trim cache.  In
low memory, this allows us to keep trimming back the "excess" unmapped
pure cache pages even if a large fraction of physical memory is occupied
by mapped pages.  

Right now, on my 64MB box this kernel is so much more responsive than
pre9 that it is scary.  Ditto 8MB.  Kernel builds also now proceed
without excessive cache trimming: even pre9 used to show large amounts
of disk read activity as the include file working set got tossed from
cache, but halving the "count" limit as below is enough to eliminate
that entirely.  The new limit also has the side effect of allowing
swapout to stream much more effectively, without any signs of the cache
growing to excess.  Sustained IO activity grows the cache to about the
same size as in previous kernels.

Up to you, take it or leave it, but right now one of the major benefits
we are touting for 2.2 over 2.0 is performance, and people will expect
2.2.0's performance to be representative of the 2.2.* series.  Right now
we are way behind the 131+ kernels on that front.

--Stephen

----------------------------------------------------------------
--- mm/filemap.c.~1~	Thu Jan 21 10:26:41 1999
+++ mm/filemap.c	Mon Jan 25 12:59:38 1999
@@ -125,7 +125,7 @@
 	struct page * page;
 	int count;
 
-	count = (limit << 1) >> priority;
+	count = limit >> priority;
 
 	page = mem_map + clock;
 	do {
@@ -147,7 +147,6 @@
 			clock = page - mem_map;
 		}
 		
-		count--;
 		referenced = test_and_clear_bit(PG_referenced, &page->flags);
 
 		if (PageLocked(page))
@@ -159,6 +158,8 @@
 		/* We can't free pages unless there's just one user */
 		if (atomic_read(&page->count) != 1)
 			continue;
+
+		count--;
 
 		/*
 		 * Is it a page swap page? If so, we want to
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
@ 1999-01-25 16:52                                                                                   ` Andrea Arcangeli
  1999-01-25 18:27                                                                                   ` Linus Torvalds
                                                                                                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-25 16:52 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Alan Cox, werner, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm

On Mon, 25 Jan 1999, Stephen C. Tweedie wrote:

> Nevertheless, the 2.2.0-pre9 VM sucks.  I've been getting seriously
> frustrated at pre-9's interactive feel over the past few days.

Just for curiosity, did you try my latest
ftp://e-mind.com/pub/linux/arca-tree/2.2.0-pre9_arca-2.gz ?

I would like if you would apply it, recompile and reboot and see how it
feels. You should not waste more than 5/10 minutes to do that.

> Linus, there really are fundamental problems remaining in the VM in
> 2.2.0-pre right now.  The two biggest are the lack of responsiveness
> of kswapd and a general misbalance in the cache management.

kswapd is not an issue. kswapd has nothing to do with performance. Feel
free to change kswapd rating as you want to see with your eyes.

The _problem_ of pre9 is try_to_free_pages(). I just posted a patch that I
think could help (note never tried such patch myself though).

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 13:14                                                                             ` Dr. Werner Fink
@ 1999-01-25 17:56                                                                               ` Stephen C. Tweedie
  1999-01-25 19:10                                                                               ` Andrea Arcangeli
  1 sibling, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-25 17:56 UTC (permalink / raw
  To: Dr. Werner Fink
  Cc: Eric W. Biederman, Stephen C. Tweedie, Andrea Arcangeli,
	Rik van Riel, Zlatko Calusic, Savochkin Andrey Vladimirovich,
	steve, brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

Hi,

On Mon, 25 Jan 1999 14:14:09 +0100, "Dr. Werner Fink" <werner@suse.de>
said:

> which leads into load upper 30.  You can see a great performance upto
> load to 25 ... 30+ *and* a brutal break down of that performance
> at this point.  The system is a PentiumII 400MHz with 32, 64, 128MB
> (mem=xxx) and SCSI only.  In comparision to 2.0.36 the performance
> is *beside of this break down* much better ...  that means that only
> the performance break down at high load is the real problem.

But is the performance of 2.0.36 better or worse at high load?

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
  1999-01-25 16:52                                                                                   ` Andrea Arcangeli
@ 1999-01-25 18:27                                                                                   ` Linus Torvalds
  1999-01-25 18:43                                                                                     ` Stephen C. Tweedie
  1999-01-25 18:43                                                                                   ` Linus Torvalds
  1999-01-26  1:57                                                                                   ` Andrea Arcangeli
  3 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-25 18:27 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Alan Cox, werner, andrea, riel, Zlatko.Calusic, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm


On Mon, 25 Jan 1999, Stephen C. Tweedie wrote:
> 
> Regarding the former, is there any chance you'd consider adding a kswapd
> wakeup when low_memory gets set in get_free_pages()?  Being able to
> respond to a burst in network traffic without locking up is not exactly
> a minor issue.

I did that, only to revert it later, because I didn't think it would make
any difference - processes that get to that point will try to free up
memory on their own anyway. 

Note that it wouldn't ever trigger for GFP_ATOMIC allocations, so I
suspect you haven't actually _tried_ it? For a machine that gets burst of
network traffic with nothing else going on, adding it should essentially
amount to a no-op.

I'll look at your other patch.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
  1999-01-25 16:52                                                                                   ` Andrea Arcangeli
  1999-01-25 18:27                                                                                   ` Linus Torvalds
@ 1999-01-25 18:43                                                                                   ` Linus Torvalds
  1999-01-25 19:15                                                                                     ` Stephen C. Tweedie
  1999-01-26  1:57                                                                                   ` Andrea Arcangeli
  3 siblings, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-25 18:43 UTC (permalink / raw
  To: Stephen C. Tweedie; +Cc: linux-mm


On Mon, 25 Jan 1999, Stephen C. Tweedie wrote:
> 
> The changes are very similar to the self-tuning clock counter we had in
> those ac* vms.  The modified shrink_mmap() just avoids decrementing the
> count for locked, non-DMA (if GFP_DMA) or multiply-mapped pages.  The
> effect is to avoid counting memory mapped pages when we trim cache.  In
> low memory, this allows us to keep trimming back the "excess" unmapped
> pure cache pages even if a large fraction of physical memory is occupied
> by mapped pages.  

Parts of your patch makes sense, other parts make no sense at all.

For example, the "multiply by two" that you removed, is done in order to
make shrink_mmap() look at all pages when given a priority of zero. Your
patch makes it possible that shrink_mmap() wouldn't have looked at all
pages, because count is still decremented before looking at "referenced"

I don't think that's actually a problem, because before we call
shrink_mmap() with argument 0, we'll have called it many times before, and
that together with the fact that you changed the count to not be
decremented for shared pages makes the "problem" fairly academic. So my
only objection is basically that I think you mixed up the behaviour of the
new patch with the (original) patch of yours that made count decrements
conditional on the PG_referenced bit.

Basically, this is _very_ different from the self-tuning clock you
proposed earlier: your earlier patch had the explanation that you wanted
to more quickly go through referenced pages, while this one goes through
_shared_ pages more quickly. Big difference.

I like the second way of thinking about it a lot more, though. And it may
be that even though you _thought_ that the first one was due to reference
counting, the shared page issue was the more important one. 

As far as I can see, this patch essentially makes us more likely to keep
shared pages - something I wholeheartedly agree with, and I'll apply it. I
just wanted to point out that I think you're making up the explanations
for your patches as you go along, and that this is NOT the same
explanation you had for your earlier patch that did a very similar thing.
Sounds like you made up the explanations after making the patch.

			Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 18:27                                                                                   ` Linus Torvalds
@ 1999-01-25 18:43                                                                                     ` Stephen C. Tweedie
  1999-01-25 18:49                                                                                       ` Linus Torvalds
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-25 18:43 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Stephen C. Tweedie, Alan Cox, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

Hi,

On Mon, 25 Jan 1999 10:27:30 -0800 (PST), Linus Torvalds
<torvalds@transmeta.com> said:

>> Regarding the former, is there any chance you'd consider adding a kswapd
>> wakeup when low_memory gets set in get_free_pages()?  Being able to
>> respond to a burst in network traffic without locking up is not exactly
>> a minor issue.

> Note that it wouldn't ever trigger for GFP_ATOMIC allocations, so I
> suspect you haven't actually _tried_ it? For a machine that gets burst of
> network traffic with nothing else going on, adding it should essentially
> amount to a no-op.

Correct: I haven't been testing any of the networking stuff myself so it
has been a non-issue for any of my workloads here.  Obviously any check
for this case would have to be outside the GFP_WAIT conditional, but it
does make sense to set low_on_memory there anyway.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 18:43                                                                                     ` Stephen C. Tweedie
@ 1999-01-25 18:49                                                                                       ` Linus Torvalds
  0 siblings, 0 replies; 243+ messages in thread
From: Linus Torvalds @ 1999-01-25 18:49 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Alan Cox, werner, andrea, riel, Zlatko.Calusic, ebiederm+eric,
	saw, steve, damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm


On Mon, 25 Jan 1999, Stephen C. Tweedie wrote:
> 
> Correct: I haven't been testing any of the networking stuff myself so it
> has been a non-issue for any of my workloads here.  Obviously any check
> for this case would have to be outside the GFP_WAIT conditional, but it
> does make sense to set low_on_memory there anyway.

In fact, I wonder if we shouldn't just get rid of the GFP_WAIT conditional
in __get_free_pages(), and make all that unconditional, so that we track
low memory situations correctly even for atomic network traffic -
something that obviously is a GoodThing(tm) to do. Then we could just make
sure that try_to_free_pages() returns immediately for anything that
doesn't have GFP_WAIT set, and have all the kswapd logic there.

That would even get rid of a test in the common path.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 13:14                                                                             ` Dr. Werner Fink
  1999-01-25 17:56                                                                               ` Stephen C. Tweedie
@ 1999-01-25 19:10                                                                               ` Andrea Arcangeli
  1999-01-25 20:49                                                                                 ` Dr. Werner Fink
  1 sibling, 1 reply; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-25 19:10 UTC (permalink / raw
  To: Dr. Werner Fink
  Cc: Eric W. Biederman, Stephen C. Tweedie, Rik van Riel,
	Zlatko Calusic, Linus Torvalds, Savochkin Andrey Vladimirovich,
	steve, brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

On Mon, 25 Jan 1999, Dr. Werner Fink wrote:

> I'm using simple two loops in different kernel trees:
> 
>       while true; do make clean; make MAKE='make -j10'; done

Tried now.

> which leads into load upper 30.  You can see a great performance upto

Here the load never goes over 25. There are always 35mbyte of swap used
with 128Mbyte of ram (I am using egcs-1.1.1 btw). So I guess that the
problem of pre9 you are reporting is the VM and nothing related to the
scheduler (maybe not?). 

I am writing this with the machine under load and here inside pine ;) it's
responsive as when it was idle. 

> load to 25 ... 30+ *and* a brutal break down of that performance
> at this point.  The system is a PentiumII 400MHz with 32, 64, 128MB

Here with a double-PII 450MHz but with 1 slow IDE hd (6mbyte sec
reported by hdparm -t /dev/hda, seek time is far worse ;). Everything is
in the same phys HD and there's only two partitions one for the swap and
one for ext2 (rootfs).

> (mem=xxx) and SCSI only.  In comparision to 2.0.36 the performance
> is *beside of this break down* much better ...  that means that only
> the performance break down at high load is the real problem.

I suggest you try out my:

	ftp://e-mind.com/pub/linux/arca-tree/2.2.0-pre9_arca-2.gz

It's against 2.2.0-pre9 and has also my latest VM in it.

> What's about a `PG_recently_swapped_in' bit for pages which arn't found
> anymore with the swap cache?  This isn't a prediction but a protection
> against throwing out the same page in the following cycle.

I am not sure I have understood well, but if the page has been thrown out
from the swap cache it means that the page is gone and so it will be
difficult to mark the page PG_recently_swapped_in ;). But we could use the
same logic with a bit in the swap entry to handle that (we have 6 custom
bit to use and only one is used right now, and it's SHM_SWP_TYPE).

But I don't think it's the right approch. The swap_cache should just be
able to throw out only the right pages. See below ... 

> > People keep playing with ignoring PG_referenced in shrink_mmap for the swap cache,
> > because it doesn't seem terribly important.  If you could demonstrate
> > this is a problem we can stop ignoring it.

Eric, it's important in fact. I am handling aging in the swap cache here.
That's an _important_ point for performance. I don't remember if I pointed
this out before.

Andrea Arcangeli

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 18:43                                                                                   ` Linus Torvalds
@ 1999-01-25 19:15                                                                                     ` Stephen C. Tweedie
  0 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-25 19:15 UTC (permalink / raw
  To: Linus Torvalds; +Cc: Stephen C. Tweedie, linux-mm

Hi,

On Mon, 25 Jan 1999 10:43:46 -0800 (PST), Linus Torvalds
<torvalds@transmeta.com> said:

> For example, the "multiply by two" that you removed, is done in order to
> make shrink_mmap() look at all pages when given a priority of zero.

Yes, but unfortunately that same *2 is the primary constant by which we
tune the relative aggressiveness of shrink_mmap() and
try_to_swap_out().  Simple profiling was showing that for any given
value of priority, we were biased too much against reclaiming cache for
some very common workloads: kernel builds were evicting include files
between gcc invocations on machines from 16MB right up to 64MB.

> So my only objection is basically that I think you mixed up the
> behaviour of the new patch with the (original) patch of yours that
> made count decrements conditional on the PG_referenced bit.

> Basically, this is _very_ different from the self-tuning clock you
> proposed earlier: your earlier patch had the explanation that you wanted
> to more quickly go through referenced pages, while this one goes through
> _shared_ pages more quickly. Big difference.

OK, let me give you the fuller explanation. :)

> I like the second way of thinking about it a lot more, though. And it may
> be that even though you _thought_ that the first one was due to reference
> counting, the shared page issue was the more important one. 

Yes and no.  The shared page issue dominates on low memory, that much is
clear, but the two patches do behave similarly in that case: we do not
expect to have too much excess cache in low memory, and shared pages
will dominate (and on 8MB, you can clearly see that they do dominate).
Both generations of the patch avoid counting those pages in the clock.
That was _always_ intended to be the effect in low memory.  That has not
changed in the new patch.

The page referencing issue is more significant once you have a large
cache with rapid cache turnover, in which case you really do want to age
things faster.  However, that is currently dealt with anyway, by the
fact that most processes reclaim their own memory rather than relying on
kswapd, and that they do so by shrink_mmap() first rather than relying
on the try_to_free_page() internal state that we used to have.  

As a result I really don't see the page referencing as being much of a
problem now: your other changes to vmscan.c have pretty much taken care
of that according to most of the traces I've taken.

Therefore the minimum necessary change to restore the old ac* behaviour
is to address the shared page skipping.  vmstat does show the new code
keeping a very similar balance and throughput to the old version.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 19:10                                                                               ` Andrea Arcangeli
@ 1999-01-25 20:49                                                                                 ` Dr. Werner Fink
  1999-01-25 20:56                                                                                   ` Linus Torvalds
  1999-01-27 14:52                                                                                   ` Stephen C. Tweedie
  0 siblings, 2 replies; 243+ messages in thread
From: Dr. Werner Fink @ 1999-01-25 20:49 UTC (permalink / raw
  To: Andrea Arcangeli, Dr. Werner Fink
  Cc: Eric W. Biederman, Stephen C. Tweedie, Rik van Riel,
	Zlatko Calusic, Linus Torvalds, Savochkin Andrey Vladimirovich,
	steve, brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

> > load to 25 ... 30+ *and* a brutal break down of that performance
> > at this point.  The system is a PentiumII 400MHz with 32, 64, 128MB
> 
> Here with a double-PII 450Mhz but with 1 sloww IDE hd (6mbyte sec
> reported by hdparm -t /dev/hda, seek time is far worse ;). Everything is
> in the same phys HD and there's only two partitions one for the swap and
> one for ext2 (rootfs).

Ok it's a bit better than a single PII 400 MHz :-)
... with less than 64MB the break downs are going to be the common state
whereas with 128MB the system is usable.  Nevertheless whenever both make
loops taking the filesystem tree at the same time, the system performance
slows down dramatically (a `break down').

> > What's about a `PG_recently_swapped_in' bit for pages which arn't found
> > anymore with the swap cache?  This isn't a prediction but a protection
> > against throwing out the same page in the following cycle.
> 
> I am not sure to have understood well but if the page is been throw out
> from the swap cache it means that the page is gone and so it will be
> difficult to mark the page PG_recently_swapped_in ;). But we could use the
> same logic with a bit in the swap entry to handle that (we have 6 custom
> bit to use and only one is used right now, and it's SHM_SWP_TYPE).

This hypothetical bit should only be set if the page is read physical
from the swap device/file.  That means it would take one step more
to swap out this page again (test_and_clear_bit of both 
PG_recently_swapped_in and PG_referenced).

> 
> But I don't think it's the right approch. The swap_cache should just be
> able to throw out only the right pages. See below ... 
> 
> > > People keep playing with ignoring PG_referenced in shrink_mmap for the
> > > swap cache,
> > > because it doesn't seem terribly important.  If you could demonstrate
> > > this is a problem we can stop ignoring it.
> 
> Eric, it's important infact. I am handling aging in the swap cache here.
> That's an _important_ point for performances. I don't remeber if I pointed
> out this before.


              Werner
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 20:49                                                                                 ` Dr. Werner Fink
@ 1999-01-25 20:56                                                                                   ` Linus Torvalds
  1999-01-26 12:23                                                                                     ` Rik van Riel
  1999-01-27 14:52                                                                                   ` Stephen C. Tweedie
  1 sibling, 1 reply; 243+ messages in thread
From: Linus Torvalds @ 1999-01-25 20:56 UTC (permalink / raw
  To: Dr. Werner Fink
  Cc: Andrea Arcangeli, Eric W. Biederman, Stephen C. Tweedie,
	Rik van Riel, Zlatko Calusic, Savochkin Andrey Vladimirovich,
	steve, brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm


On Mon, 25 Jan 1999, Dr. Werner Fink wrote:
> 
> This hypothetical bit should only be set if the page is read physical
> from the swap device/file.  That means it would take one step more
> to swap out this page again (test_and_clear_bit of both 
> PG_recently_swapped_in and PG_referenced).

Ehh - it is already marked "accessed" in the page tables, which
essentially amounts to exactly that kind of two-level aging (the
PG_referenced bit only takes effect once the swapped-in page has once more
been evicted from the page tables) 

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25  1:38                                                                                     ` Alan Cox
  1999-01-25  1:04                                                                                       ` Andrea Arcangeli
@ 1999-01-25 21:59                                                                                       ` Gerard Roudier
  1999-01-26 11:45                                                                                         ` Thomas Sailer
  1999-01-26 13:06                                                                                         ` Stephen C. Tweedie
  1 sibling, 2 replies; 243+ messages in thread
From: Gerard Roudier @ 1999-01-25 21:59 UTC (permalink / raw
  To: Alan Cox
  Cc: Linus Torvalds, sct, werner, andrea, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm

On Mon, 25 Jan 1999, Alan Cox wrote:

> > > There are real cases where grab large linear block is needed.
> > 
> > Nobody has so far shown a reasonable implementation where this would be
> > possible.
> 
> Thats as maybe. However someone needs to find a way to do it. Right now I
> can run a matrox meteor on netbsd,freebsd,openbsd,windows95, nt but not Linux

I do not know all of these systems, but, AFAIR, FreeBSD does not guarantee
the malloc_contig() function will succeed every time, after the system has
been started (obviously for allocation > PAGE_SIZE).

If you tell me that some system XXX is able to quickly free Mega-Bytes of
physical contiguous memory at any time when it is asked for such a
brain-deaded allocation, then for sure, I will never use system XXX,
because this magic behaviour seems not to be possible without some
paranoid VM policy that may affect badly performances for normal stuff.

Now, I agree that it is theoretically possible to free a large contiguous
physical memory on most systems at any time, by flushing caches and moving
virtual pages that are only virtually referenced and this move will not
changes their virtual reference(s).

> Thats not meant as a flippant remark - its something to be stuck on the 2.3
> problem chart. Its just a question of who out there is sitting on the 
> solution.

Anything that requires more that 1 PAGE of physical memory at a time on
running systems is a very bad thing in my opinion. The PAGE is the only
required granularity of physical memory you should need on a virtual
memory system. If you ever need more, then you break simplicity and go
straight-away to complexity and cross-fingered behaviours. The only
exception is at system start-up, where it is still time for pooling 
every-thing that needs so. I am not going to change my mind on this topic
and I do not care about crappy 'Designed for Windows' things.

Regards,
   Gerard.

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
                                                                                                     ` (2 preceding siblings ...)
  1999-01-25 18:43                                                                                   ` Linus Torvalds
@ 1999-01-26  1:57                                                                                   ` Andrea Arcangeli
  1999-01-26 18:37                                                                                     ` Andrea Arcangeli
  1999-01-27 12:13                                                                                     ` Stephen C. Tweedie
  3 siblings, 2 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-26  1:57 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Alan Cox, werner, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm

On Mon, 25 Jan 1999, Stephen C. Tweedie wrote:

> --- mm/filemap.c.~1~	Thu Jan 21 10:26:41 1999
> +++ mm/filemap.c	Mon Jan 25 12:59:38 1999
> @@ -125,7 +125,7 @@
>  	struct page * page;
>  	int count;
>  
> -	count = (limit << 1) >> priority;
> +	count = limit >> priority;
>  
>  	page = mem_map + clock;
>  	do {
> @@ -147,7 +147,6 @@
>  			clock = page - mem_map;
>  		}
>  		
> -		count--;

OK to remove the << 1 and to move count-- after checking referenced.

>  		referenced = test_and_clear_bit(PG_referenced, &page->flags);
>  
>  		if (PageLocked(page))
> @@ -159,6 +158,8 @@
>  		/* We can't free pages unless there's just one user */
>  		if (atomic_read(&page->count) != 1)
>  			continue;
> +
> +		count--;

but this is plain bogus. When your machine will reach 0 freeable pages
(and that happens a bit before to kill the process because OOM) you'll get
an infinite loop in shrink_mmap().

Andrea Arcangeli

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 21:59                                                                                       ` Gerard Roudier
@ 1999-01-26 11:45                                                                                         ` Thomas Sailer
  1999-01-26 20:48                                                                                           ` Gerard Roudier
  1999-01-26 13:06                                                                                         ` Stephen C. Tweedie
  1 sibling, 1 reply; 243+ messages in thread
From: Thomas Sailer @ 1999-01-26 11:45 UTC (permalink / raw
  To: Gerard Roudier, linux-kernel, linux-mm

Gerard Roudier wrote:

> If you tell me that some system XXX is able to quickly free Mega-Bytes of
> physical contiguous memory at any time when it is asked for such a

No one said it has to happen quickly, it's entirely useful even if
the calling process (and possibly others) will sleep for 10secs.
These allocations are very uncommon, but nevertheless sometimes
necessary for some device (drivers).

> brain-deaded allocation, then for sure, I will never use system XXX,
> because this magic behaviour seems not to be possible without some
> paranoid VM policy that may affect badly performances for normal stuff.

You may well call the devices that need this broken, the problem
is that they are in rather widespread use.

If we don't find an algorithm that doesn't affect performance for
the normal stuff, (why would something like selecting
a memory region and forcing everything that's currently in the
way to be swapped out not work?), then we should probably have
a special pool for these "perverse" mappings.

But I think there's a rather generic problem: how are you going
to support 32bit PCI busmasters in machines with more than
4Gig main memory? It's conceptually the same as how are you
going to support ISA DMA with more than 16Meg main memory.

32bit only PCI busmasters are very common these days, I don't
know a single PCI soundcard that can do 64bit master (or even slave)
cycles. Also, all PCI soundcards I know which have a hardware
wavetable synth (without sample ROM) require ridiculously
large contiguous allocations (>= 1M) for the synth to work.

> Anything that requires more that 1 PAGE of physical memory at a time on
> running systems is a very bad thing in my opinion. The PAGE is the only

Ok, then remove any soundcard from your system. That might be acceptable
for you, but probably not for 90% of the Linux users.

Tom
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 20:56                                                                                   ` Linus Torvalds
@ 1999-01-26 12:23                                                                                     ` Rik van Riel
  1999-01-26 15:44                                                                                       ` Andrea Arcangeli
  0 siblings, 1 reply; 243+ messages in thread
From: Rik van Riel @ 1999-01-26 12:23 UTC (permalink / raw
  To: Linus Torvalds
  Cc: Dr. Werner Fink, Andrea Arcangeli, Eric W. Biederman,
	Stephen C. Tweedie, Zlatko Calusic,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

On Mon, 25 Jan 1999, Linus Torvalds wrote:
> On Mon, 25 Jan 1999, Dr. Werner Fink wrote:
> > 
> > This hypothetical bit should only be set if the page is read physical
> > from the swap device/file.  That means it would take one step more
> > to swap out this page again (test_and_clear_bit of both 
> > PG_recently_swapped_in and PG_referenced).
> 
> Ehh - it is already marked "accessed" in the page tables, which
> essentially amounts to exactly that kind of two-level aging (the
> PG_referenced bit only takes effect once the swapped-in page has
> once more been evicted from the page tables)

With a bit of imagination, you might even be able to
call our current scheme two-handed...

Even though it's a bit different, it seems like we
have all advantages and none of the disadvantages of
a two-handed system. The main difference is that we
do I/O on the first hand and page eviction on the
second. This gives us a buffer of ready-to-evict
pages which we can easily free when we're in a hurry.

The only thing we really need now is a way to keep
track of (and manage) that buffer of freeable pages.
I believe Andrea has a patch for that -- we should
check it out and incorporate something like that ASAP.

There are several reasons why we need it:
- we should never run out of freeable pages
  because that can introduce too much latency
  and possibly even system instability
- page aging only is effective/optimal when the
  freeable buffer is large enough
- when the freeable buffer is too large, we might
  have too many soft pagefaults or other overhead
  (not very much of a concern, but still...)
- keeping a more or less fixed distance between
  both hands could make the I/O less bursty and
  improve system I/O performance

Rik -- If a Microsoft product fails, who do you sue?
+-------------------------------------------------------------------+
| Linux memory management tour guide.             riel@nl.linux.org |
| Scouting Vries cubscout leader.     http://www.nl.linux.org/~riel |
+-------------------------------------------------------------------+

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 21:59                                                                                       ` Gerard Roudier
  1999-01-26 11:45                                                                                         ` Thomas Sailer
@ 1999-01-26 13:06                                                                                         ` Stephen C. Tweedie
  1999-01-26 14:28                                                                                           ` Alan Cox
  1 sibling, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-26 13:06 UTC (permalink / raw
  To: Gerard Roudier
  Cc: Alan Cox, Linus Torvalds, sct, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

Hi,

On Mon, 25 Jan 1999 22:59:00 +0100 (MET), Gerard Roudier
<groudier@club-internet.fr> said:

> If you tell me that some system XXX is able to quickly free Mega-Bytes of
> physical contiguous memory at any time when it is asked for such a
> brain-deaded allocation, then for sure, I will never use system XXX,
> because this magic behaviour seems not to be possible without some
> paranoid VM policy that may affect badly performances for normal stuff.

It is really not hard to reserve a certain amount of memory (up to some
fraction, say 25% or 50% of physical memory) for use only by pagable
allocations.  Most desktop boxes will _not_ require more than 50% of
memory for locked kernel pages.  Recovering any given range of
contiguous pages from that pagable region may be expensive but will
_always_ be possible, and given that it will usually be a one-off
expense during driver setup, there is no reason why we cannot support
it.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 14:28                                                                                           ` Alan Cox
@ 1999-01-26 14:15                                                                                             ` MOLNAR Ingo
  1999-01-26 14:36                                                                                               ` yodaiken
                                                                                                                 ` (2 more replies)
  1999-01-26 14:21                                                                                             ` Rik van Riel
  1 sibling, 3 replies; 243+ messages in thread
From: MOLNAR Ingo @ 1999-01-26 14:15 UTC (permalink / raw
  To: Alan Cox
  Cc: Stephen C. Tweedie, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

On Tue, 26 Jan 1999, Alan Cox wrote:

> Something like
> 
> Chop memory into 4Mb sized chunks that hold the perfectly normal and
> existing pages and buddy memory allocator. Set a flag on 25-33% of them
> to a max of say 10 and for <12Mb boxes simply say "tough".

this is conceptually 'boot-time allocation of big buffers' by splitting
all available memory into two pieces:

	size_kernel: generic memory
	size_user: only swappable

(size_kernel+size_user = ca. size_allmemory)

This still doesnt solve the 'what if we need more big buffers than
size_user' and 'what if we need kernel memory more than size_kernel'
questions, and both are valid.

another (2.3 issue) approach could be to make two guarantees for
'pinned-down kernel allocations':

	1) either they will be deallocated after some definite timeout
	2) or they can be explicitly deallocated via special mechanisms

this enables us to implement a 100% algorithm of 'moving' kernel-space
objects, without having to do kernel-space paging or other runtime costs. 
Most 'pinned down' objects already have a specific timeout (buffer heads,
used dentries) or can already be 'flushed'. (eg specific unused
dentry-cache members, and other cached but freeable kernel-space objects).

the mechanism is to 'ban' kernel-allocations from certain pages either
passively (by waiting on the objects in question), or by actively moving
them. The 'flushing' part is easy and we mainly already use it to reclaim
memory.

the toughest part is the 'moving' stuff, which is not yet present and
hard/impossible to implement in a clean and maintainable way. We need this
eg. for sockets, files, (not inodes fortunately), task structures, vmas,
mms, signal structures, etc. It really feels like a rewrite that needs a
very good architecture to be successful, ie. we _have to_ guarantee
'correctness' automatically somehow to not let this develop into a mess. 
Also, it must have only very limited 'subsystem-side' complexity to not
hinder development. Eg. in the debugging version we could have a
per-allocation timer that warns us if we havent 'unused' that particular
object within a given timeout. Dont know. It's tough.

but iff this mechanism is present, we could 'ban' allocations from
whatever physical page in the system (this isnt much different from all-VM
solutions, they have to wait for swapouts too anyway, but is a heck better
and faster wrt. mappings issues). This is very generic and does not
presume any 'split' between usage types. This is not at all limited to 4m
sections or whatever, and certainly works on any Linux system even 4M RAM
boxes, and it's still possible to use up all memory for kernel allocations
eg. for dedicated router boxes.

yes it restricts and complicates the way kernel subsystems can allocate
buffers, but we _have_ to do that iff we want to solve the problem 100%.

-- mingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 14:28                                                                                           ` Alan Cox
  1999-01-26 14:15                                                                                             ` MOLNAR Ingo
@ 1999-01-26 14:21                                                                                             ` Rik van Riel
  1 sibling, 0 replies; 243+ messages in thread
From: Rik van Riel @ 1999-01-26 14:21 UTC (permalink / raw
  To: Alan Cox
  Cc: Stephen C. Tweedie, groudier, torvalds, werner, andrea,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

On Tue, 26 Jan 1999, Alan Cox wrote:

> Chop memory into 4Mb sized chunks that hold the perfectly normal
> and existing pages and buddy memory allocator. Set a flag on
> 25-33% of them to a max of say 10 and for <12Mb boxes simply say
> "tough".

We might also want to flag non-cached and dma areas too.
That way we can hand cached, non-dma memory to the kernel,
use non-cached stuff for buffer memory and page tables,
keeping dma-able memory relatively clean and keeping the
kernel (and critical pages) fast.

Maybe the execute bit should also have some influence on
placement. Having executable text in uncached memory may
well give a larger performance penalty than putting user
data there...

In my zone allocator design I have outlined 5 or 7 (depending
on how you look at it) different memory usages for the Linux
kernel. You might want to check that out to see if you've
overlooked something:

http://www.nl.linux.org/~riel/zone-alloc.html

> The performance impact of that on free page requests seems to be
> pretty minimal. In actual fact it wil help performance in some
> cases since the machine can't easily be killed by going out of non
> kernel space allocations - the 25% is also a "can do work" sanity
> check.

It's very well possible to keep separate free memory stats
and free memory from the different area's as needed.

cheers,

Rik -- If a Microsoft product fails, who do you sue?
+-------------------------------------------------------------------+
| Linux memory management tour guide.             riel@nl.linux.org |
| Scouting Vries cubscout leader.     http://www.nl.linux.org/~riel |
+-------------------------------------------------------------------+

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 13:06                                                                                         ` Stephen C. Tweedie
@ 1999-01-26 14:28                                                                                           ` Alan Cox
  1999-01-26 14:15                                                                                             ` MOLNAR Ingo
  1999-01-26 14:21                                                                                             ` Rik van Riel
  0 siblings, 2 replies; 243+ messages in thread
From: Alan Cox @ 1999-01-26 14:28 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: groudier, alan, torvalds, werner, andrea, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm

> It is really not hard to reserve a certain amount of memory (up to some
> fraction, say 25% or 50% of physical memory) for use only by pagable

I was guessing 25%. 

> allocations.  Most desktop boxes will _not_ require more than 50% of
> memory for locked kernel pages.  Recovering any given range of
> contiguous pages from that pagable region may be expensive but will
> _always_ be possible, and given that it will usually be a one-off
> expense during driver setup, there is no reason why we cannot support
> it.

Something like

Chop memory into 4Mb sized chunks that hold the perfectly normal and existing
pages and buddy memory allocator. Set a flag on 25-33% of them to a max of say 
10 and for <12Mb boxes simply say "tough". 

The performance impact of that on free page requests seems to be pretty minimal.
In actual fact it will help performance in some cases since the machine can't 
easily be killed by going out of non kernel space allocations - the 25% is
also a "can do work" sanity check.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 14:15                                                                                             ` MOLNAR Ingo
@ 1999-01-26 14:36                                                                                               ` yodaiken
  1999-01-26 15:21                                                                                                 ` MOLNAR Ingo
  1999-01-26 15:46                                                                                                 ` Alan Cox
  1999-01-26 16:37                                                                                               ` Stephen C. Tweedie
  1999-01-27 11:35                                                                                               ` Jakub Jelinek
  2 siblings, 2 replies; 243+ messages in thread
From: yodaiken @ 1999-01-26 14:36 UTC (permalink / raw
  To: MOLNAR Ingo
  Cc: alan, sct, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

> 
> On Tue, 26 Jan 1999, Alan Cox wrote:
> 
> > Something like
> > 
> > Chop memory into 4Mb sized chunks that hold the perfectly normal and
> > existing pages and buddy memory allocator. Set a flag on 25-33% of them
> > to a max of say 10 and for <12Mb boxes simply say "tough".
> 
> this is conceptually 'boot-time allocation of big buffers' by splitting
> all available memory into two pieces:
> 
> 	size_kernel: generic memory
> 	size_user: only swappable
> 
> (size_kernel+size_user = ca. size_allmemory)
> 
> This still doesnt solve the 'what if we need more big buffers than
> size_user' and 'what if we need kernel memory more than size_kernel'
> questions, and both are valid.

Solved by reboot.

> the toughest part is the 'moving' stuff, which is not yet present and
> hard/impossible to implement in a clean and maintainable way. We need this
> eg. for sockets, files, (not inodes fortunately), task structures, vmas,

What's the benefit?  If you need big chunks of physical memory, then you
obviously are willing to sacrifice efficient use of every last byte.

> yes it restricts and complicates the way kernel subsystems can allocate
> buffers, but we _have_ to do that iff we want to solve the problem 100%.

So for that last 10% of "solve" we introduce a lot of complexity into 
every subsystem?

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 14:36                                                                                               ` yodaiken
@ 1999-01-26 15:21                                                                                                 ` MOLNAR Ingo
  1999-01-27 10:31                                                                                                   ` yodaiken
  1999-01-26 15:46                                                                                                 ` Alan Cox
  1 sibling, 1 reply; 243+ messages in thread
From: MOLNAR Ingo @ 1999-01-26 15:21 UTC (permalink / raw
  To: yodaiken
  Cc: alan, sct, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

On Tue, 26 Jan 1999 yodaiken@chelm.cs.nmt.edu wrote:

> What's the benefit?  If you need big chunks of physical memory, then you
> obviously are willing to sacrifice efficient use of every last byte. 

no, what i want to have is support for on-demand shared-physical-memory
hardware. Resource management. Alan has listed a few examples, and the
list is not expected to get smaller. You are right, if we want to have big
chunks of physical memory then we'll allocate it on reboot.

i dont think it's correct to say: 'anything that cannot be segmented in
the physical memory space with page granularity, is considered to be
broken in this regard and is not guaranteed to be 100% supported by the
Linux architecture'. 

> > yes it restricts and complicates the way kernel subsystems can allocate
> > buffers, but we _have_ to do that iff we want to solve the problem 100%.
> 
> So for that last 10% of "solve" we introduce a lot of complexity into 
> every subsystem?

no, as i pointed it out:

> Also, it must have only very limited 'subsystem-side' complexity to not
> hinder development. [...]

plus, i'd like to point out that if we do something, we preferredly want
to do it 100% correct, especially if the 'packet loss' is visible by
user-space as well. But i'm not at all requesting it: 

> the toughest part is the 'moving' stuff, which is not yet present and
> hard/impossible to implement in a clean and maintainable way.
       ^^^^^^^^^^---(this might as well be the case)

-- mingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 12:23                                                                                     ` Rik van Riel
@ 1999-01-26 15:44                                                                                       ` Andrea Arcangeli
  0 siblings, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-26 15:44 UTC (permalink / raw
  To: Rik van Riel
  Cc: Linus Torvalds, Dr. Werner Fink, Eric W. Biederman,
	Stephen C. Tweedie, Zlatko Calusic,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

On Tue, 26 Jan 1999, Rik van Riel wrote:

> The only thing we really need now is a way to keep
> track of (and manage) that buffer of freeable pages.
> I believe Andrea has a patch for that -- we should
> check it out and incorporate something like that ASAP.

It's just running fine here. If somebody want to run it too, just
go in sync with my current tree:

	ftp://e-mind.com/pub/linux/arca-tree/2.2.0-pre9_arca-3.gz

When you'll press SHIFT+SCROLL-LOCK you'll see both a Free and Freeable
fields.  The freeable fields tell you how many freeable pages you have on
your machine between both the buffer and the file cache. And my VM just
autobalance in function of the percentage of freeable pages in the system.
This works very well.

The only remark of my implementation is that I added a check in the
free_pages() common path. Doing that the accounting of the freeable pages
in the cache it's been trivial. I didn't changed the way the cache is
allocated and deallocated because I wanted to see how much knowing the
freeable pages number could be useful in the try_to_free_pages() algorithm
before going into major hacks. So now I have the right number with the
minimal changes with a bit (really only a bit) of overhead in
free_pages()). I am sure that the overhead I added in free_pages() is
_not_ noticable in benchmarks (it's the same of checking for __GFP_WAIT at
the start of every __get_free_pages()). 

> There are several reasons why we need it:
> - we should never run out of freeable pages
>   because that can introduce too much latency
>   and possibly even system instability

Hmm, that looks like mostly a performance problem (but I am supposing
that try_to_free_pages() has a safe implementation of course ;). 

> - page aging only is effective/optimal when the
>   freeable buffer is large enough

Infact. This is the major point. And the nice side effect is that once the
freeable pages are balanced to a certain number, everything else between
cache and buffers got automagically balanced. We don't need min limitis of
buffers or of cache anymore and this allow to use _all_ the memory as
best.

> - when the freeable buffer is too large, we might
>   have too many soft pagefaults or other overhead
>   (not very much of a concern, but still...)

Agreed.

> - keeping a more or less fixed distance between
>   both hands could make the I/O less bursty and
>   improve system I/O performance

Exactly this is the other major point. Keeping a balance of freeable pages
forces the algorithm to do swapout and shrink_mmap in a way that scales very
well.

Andrea Arcangeli

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 14:36                                                                                               ` yodaiken
  1999-01-26 15:21                                                                                                 ` MOLNAR Ingo
@ 1999-01-26 15:46                                                                                                 ` Alan Cox
  1999-01-26 16:45                                                                                                   ` Stephen C. Tweedie
  1 sibling, 1 reply; 243+ messages in thread
From: Alan Cox @ 1999-01-26 15:46 UTC (permalink / raw
  To: yodaiken
  Cc: mingo, alan, sct, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

> > the toughest part is the 'moving' stuff, which is not yet present and
> > hard/impossible to implement in a clean and maintainable way. We need this
> > eg. for sockets, files, (not inodes fortunately), task structures, vmas,
> 
> What's the benefit?  If you need big chunks of physical memory, then you
> obviously are willing to sacrifice efficient use of every last byte.
> 
> > yes it restricts and complicates the way kernel subsystems can allocate
> > buffers, but we _have_ to do that iff we want to solve the problem 100%.
> 
> So for that last 10% of "solve" we introduce a lot of complexity into 
> every subsystem?

We don't need to solve the 100% case. Simply being sure we can (slowly)
allocate up to 25% of RAM in huge chunks is going to be enough. Good point
Ingo on one thing I'd missed - the big chunks themselves need some kind
of handles since the moment we hand out 512K chunks we may not be able to 
shuffle and get a 4Mb block

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 14:15                                                                                             ` MOLNAR Ingo
  1999-01-26 14:36                                                                                               ` yodaiken
@ 1999-01-26 16:37                                                                                               ` Stephen C. Tweedie
  1999-01-27 11:35                                                                                               ` Jakub Jelinek
  2 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-26 16:37 UTC (permalink / raw
  To: MOLNAR Ingo
  Cc: Alan Cox, Stephen C. Tweedie, groudier, torvalds, werner, andrea,
	riel, Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent,
	reese, kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

Hi,

On Tue, 26 Jan 1999 15:15:04 +0100 (CET), MOLNAR Ingo
<mingo@chiara.csoma.elte.hu> said:

> this is conceptually 'boot-time allocation of big buffers' by splitting
> all available memory into two pieces:

> 	size_kernel: generic memory
> 	size_user: only swappable

which is something we already need to do if we want to support 4G
physical memory cleanly (ie. add support for anonymous and cache pages
to be mapped on demand into the kernel's VA, rather than having the
whole of physical memory virtually mapped at all times).

> the toughest part is the 'moving' stuff, which is not yet present and
> hard/impossible to implement in a clean and maintainable way. 

Not at all.  We only need to be able to free swappable pages: only the
swappable region will be usable for large contiguous allocations.  We
don't even need to move them: they can be paged back in afterwards.
Obviously this is expensive if we expect to do it often, but in reality
it is probably something we only need to do at driver setup time (or
occasionally on driver open).

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 15:46                                                                                                 ` Alan Cox
@ 1999-01-26 16:45                                                                                                   ` Stephen C. Tweedie
  1999-01-30  7:01                                                                                                     ` yodaiken
  0 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-26 16:45 UTC (permalink / raw
  To: Alan Cox
  Cc: yodaiken, mingo, sct, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

Hi,

On Tue, 26 Jan 1999 15:46:23 +0000 (GMT), alan@lxorguk.ukuu.org.uk (Alan
Cox) said:
> We don't need to solve the 100% case. Simply being sure we can (slowly)
> allocate up to 25% of RAM in huge chunks is going to be enough. Good point
> Ingo on one thing I'd missed - the big chunks themselves need some kind
> of handles since the moment we hand out 512K chunks we may not be able to 
> shuffle and get a 4Mb block

The idea was to decide what region to hand out, _then_ to clear it.
Standard best-fit algorithms apply when carving up the region.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26  1:57                                                                                   ` Andrea Arcangeli
@ 1999-01-26 18:37                                                                                     ` Andrea Arcangeli
  1999-01-27 12:13                                                                                     ` Stephen C. Tweedie
  1 sibling, 0 replies; 243+ messages in thread
From: Andrea Arcangeli @ 1999-01-26 18:37 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: Linus Torvalds, Alan Cox, werner, riel, Zlatko.Calusic,
	ebiederm+eric, saw, steve, damonbrent, reese, kalle.andersson,
	bmccann, bredelin, linux-kernel, linux-mm

On Tue, 26 Jan 1999, Andrea Arcangeli wrote:

> On Mon, 25 Jan 1999, Stephen C. Tweedie wrote:
> 
> > --- mm/filemap.c.~1~	Thu Jan 21 10:26:41 1999
> > +++ mm/filemap.c	Mon Jan 25 12:59:38 1999
> > @@ -125,7 +125,7 @@
> >  	struct page * page;
> >  	int count;
> >  
> > -	count = (limit << 1) >> priority;
> > +	count = limit >> priority;
> >  
> >  	page = mem_map + clock;
> >  	do {
> > @@ -147,7 +147,6 @@
> >  			clock = page - mem_map;
> >  		}
> >  		
> > -		count--;
> 
> OK to remove the << 1 and to move count-- after checking referenced.
> 
> >  		referenced = test_and_clear_bit(PG_referenced, &page->flags);
> >  
> >  		if (PageLocked(page))
> > @@ -159,6 +158,8 @@
> >  		/* We can't free pages unless there's just one user */
> >  		if (atomic_read(&page->count) != 1)
> >  			continue;
> > +
> > +		count--;
> 
> but this is plain bogus. When your machine will reach 0 freeable pages
> (and that happens a bit before to kill the process because OOM) you'll get
> an infinite loop in shrink_mmap().

So I gues that it won't be hard to lockup 2.2.0 just causing the number of
freeable pages to go to 0. Did somebody tried to go OOM with 2.2.0 yet?

Here I can cause nr_freeable_pages to go to 0 pretty easily (and that
happens just a bit before to get 1 process killed). I hope that 2.2.0 VM
will behave differently. 

Andrea Arcangeli

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 11:45                                                                                         ` Thomas Sailer
@ 1999-01-26 20:48                                                                                           ` Gerard Roudier
  1999-01-26 21:24                                                                                             ` Thomas Sailer
                                                                                                               ` (2 more replies)
  0 siblings, 3 replies; 243+ messages in thread
From: Gerard Roudier @ 1999-01-26 20:48 UTC (permalink / raw
  To: Thomas Sailer; +Cc: linux-kernel, linux-mm


On Tue, 26 Jan 1999, Thomas Sailer wrote:

> Gerard Roudier wrote:
> 
> > If you tell me that some system XXX is able to quickly free Mega-Bytes of
> > physical contiguous memory at any time when it is asked for such a
> 
> Noone said it has to happen quickly, it's entirely useful even if
> the calling process (and possibly others) will sleep for 10secs.
> These allocations are very uncommon, but nevertheless sometimes
> necessary for some device (drivers).

I suggest to allow some application program to decide what stuff to
victimize and to be able to tell the kernel about, but not to ask the
kernel for doing the bad work for you and then critisize it.

> > brain-deaded allocation, then for sure, I will never use system XXX,
> > because this magic behaviour seems not to be possible without some
> > paranoid VM policy that may affect badly performances for normal stuff.
> 
> You may well call the devices that need this broken, the problem
> is that they are in rather widespread use.

There are bunches of things that are in widespread use nowadays and that 
should have disappeared years ago if people were a bit more concerned 
by technical and progress considerations.

For example, it seems that 32 bits systems are not enough to provide a
flat virtual addressing space far larger than the physical address space
needed for applications (that was the primary goal of virtual memory
invention). If we were powered a bit more by technical considerations, we
should drop support of 32 bits systems immediately since as you know 64
bits systems are available since years and Linux supports them quite well.

Each time we add support or maintain support of crap, we just encourage
crap and allow the mediocrity to last longer that it really deserves. 

A device that requires more contiguous space than 1 PAGE for its 
support is crap. Because designers spared peanuts by not implementing 
address translation tables, we just have to complete their work by 
complexifying O/Ses. The win is 0.02 euro of silicium for them but 
lots of time wasted by O/Ses guys to support the crap.

> If we don't find an algorithm that doesn't affect performance for
> the normal stuff, (why would something like selecting
> a memory region and forcing everything that's currently in the
> way to be swapped out not work?), then we should probably have
> a special pool for these "perverse" mappings.
> 
> But I think there's a rather generic problem: how are you going
> to support 32bit PCI busmasters in machines with more than
> 4Gig main memory? It's conceptually the same as how are you
> going to support ISA DMA with more than 16Meg main memory.

What the ratio of machines that need 4 GB of more for doing their 
work?
How much does they cost?
What can we do, if some people that have such machines want to use 
IO controllers that are not able to DMA the whole physical space?
We just may suggest them to learn or to get help from a psychiatric, 
but we should not accept to waste time trying to make the crap work 
less worse.

> 32bit only PCI busmasters are very common these days, I don't
> know a single PCI soundcard that can do 64bit master (or even slave)
> cycles. Also, all PCI soundcards I know which have a hardware
> wavetable synth (without sample ROM) require ridiculously
> large contiguous allocations (>= 1M) for the synth to work.

Are you sure a soundcard is really required for systems that run 
with GBs of memory?

> > Anything that requires more that 1 PAGE of physical memory at a time on
> > running systems is a very bad thing in my opinion. The PAGE is the only
> 
> Ok, then remove any soundcard from your system. That might be acceptable
> for you, but probably not for 90% of the Linux users.

A real Linux user is able to make a custom kernel that incorporates some
driver at boot-up, and can live with that. The ones that are whining about
their PinkSocket O/S having problem to load the sound driver module at 
run-time is another busyness in my opinion.

Regards,
   Gerard.

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 20:48                                                                                           ` Gerard Roudier
@ 1999-01-26 21:24                                                                                             ` Thomas Sailer
  1999-01-27  0:25                                                                                             ` David Lang
  1999-01-27 16:05                                                                                             ` Stephen C. Tweedie
  2 siblings, 0 replies; 243+ messages in thread
From: Thomas Sailer @ 1999-01-26 21:24 UTC (permalink / raw
  To: Gerard Roudier; +Cc: linux-kernel, linux-mm

Gerard Roudier wrote:

> What can we do, if some people that have such machines want to use
> IO controllers that are not able to DMA the whole physical space?

Does it matter for a soundcard that just needs its 64k buffer allocated
on driver open and then be happy for the rest of its life?

Fact is that soundcard design was broken since its inception and
I've given up hope that someone in that business sees some light 8-)

> Are you sure a soundcard is really required for systems that run
> with GBs of memory?

Have you seen a PC without one lately?
Almost all Linux guys I know want to listen to MP3 files 8-)

Tom
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 20:48                                                                                           ` Gerard Roudier
  1999-01-26 21:24                                                                                             ` Thomas Sailer
@ 1999-01-27  0:25                                                                                             ` David Lang
  1999-01-27 16:05                                                                                             ` Stephen C. Tweedie
  2 siblings, 0 replies; 243+ messages in thread
From: David Lang @ 1999-01-27  0:25 UTC (permalink / raw
  To: Gerard Roudier; +Cc: Thomas Sailer, linux-kernel, linux-mm

-----BEGIN PGP SIGNED MESSAGE-----

why don'y you spin off your own verion of Linux that only supports that
hardware that you consider "good enough" and let the rest of us go back to
making the system work in the real world where lots of things are not
"good enough" by someones definition.

David Lang

"If users are made to understand that the system administrator's job is to
make computers run, and not to make them happy, they can, in fact, be made
happy most of the time. If users are allowed to believe that the system
administrator's job is to make them happy, they can, in fact, never be made
happy." 
- -Paul Evans (as quoted by Barb Dijker in "Managing Support Staff", LISA '97)

On Tue, 26 Jan 1999, Gerard Roudier wrote:

> Date: Tue, 26 Jan 1999 21:48:59 +0100 (MET)
> From: Gerard Roudier <groudier@club-internet.fr>
> To: Thomas Sailer <sailer@ife.ee.ethz.ch>
> Cc: linux-kernel@vger.rutgers.edu, linux-mm@kvack.org
> Subject: Re: MM deadlock [was: Re: arca-vm-8...]
> 
> 
> 
> On Tue, 26 Jan 1999, Thomas Sailer wrote:
> 
> > Gerard Roudier wrote:
> > 
> > > If you tell me that some system XXX is able to quickly free Mega-Bytes of
> > > physical contiguous memory at any time when it is asked for such a
> > 
> > Noone said it has to happen quickly, it's entirely useful even if
> > the calling process (and possibly others) will sleep for 10secs.
> > These allocations are very uncommon, but nevertheless sometimes
> > necessary for some device (drivers).
> 
> I suggest to allow some application program to decide what stuff to
> victimize and to be able to tell the kernel about, but not to ask the
> kernel for doing the bad work for you and then critisize it.
> 
> > > brain-deaded allocation, then for sure, I will never use system XXX,
> > > because this magic behaviour seems not to be possible without some
> > > paranoid VM policy that may affect badly performances for normal stuff.
> > 
> > You may well call the devices that need this broken, the problem
> > is that they are in rather widespread use.
> 
> There are bunches of things that are widespread used nowadays and that 
> should have disappeard since years if people were a bit more concerned 
> by technical and progress considerations.
> 
> For example, it seems that 32 bits systems are not enough to provide a
> flat virtual addressing space far larger than the physical address space
> needed for applications (that was the primary goal of virtual memory
> invention). If we were powered a bit more by technical considerations, we
> should drop support of 32 bits systems immediately since as you know 64
> bits systems are available since years and Linux supports them quite well.
> 
> Each time we add support or maintain support of crap, we just encourage
> crap and allow the mediocrity to last longer that it really deserves. 
> 
> A device that requires more contiguous space than 1 PAGE for its 
> support is crap. Because designers spared peanuts by not implementing 
> address translation tables, we just have to complete their work by 
> complexifying O/Ses. The win is 0.02 euro of silicium for them but 
> lots of time wasted by O/Ses guys to support the crap.
> 
> > If we don't find an algorithm that doesn't affect preformance for
> > the normal stuff, (why would something like selecting
> > a memory region and forcing everything that's currently in the
> > way to be swapped out not work?), then we should probably have
> > a special pool for these "perverse" mappings.
> > 
> > But I think there's a rather generic problem: how are you going
> > to support 32bit PCI busmasters in machines with more than
> > 4Gig main memory? It's conceptually the same as how are you
> > going to support ISA DMA with more than 16Meg main memory.
> 
> What the ratio of machines that need 4 GB of more for doing their 
> work?
> How much does they cost?
> What can we do, if some people that have such machines want to use 
> IO controllers that are not able to DMA the whole physical space?
> We just may suggest them to learn or to get help from a psychiatric, 
> but we should not accept to waste time trying to make the crap work 
> less worse.
> 
> > 32bit only PCI busmasters are very common these days, I don't
> > know a single PCI soundcard that can do 64bit master (or even slave)
> > cycles. Also, all PCI soundcards I know which have a hardware
> > wavetable synth (without sample ROM) require ridiculously
> > large contiguous allocations (>= 1M) for the synth to work.
> 
> Are you sure a soundcard is really required for systems that run 
> with GBs of memory?
> 
> > > Anything that requires more that 1 PAGE of physical memory at a time on
> > > running systems is a very bad thing in my opinion. The PAGE is the only
> > 
> > Ok, then remove any soundcard from your system. That might be acceptable
> > for you, but probably not for 90% of the Linux users.
> 
> A real Linux user is able to make a custom kernel that incorporates some
> driver at boot-up, and can live with that. The ones that are whining about
> their PinkSocket O/S having problem to load the sound driver module at 
> run-time is another busyness in my opinion.
> 
> Regards,
>    Gerard.
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.rutgers.edu
> Please read the FAQ at http://www.tux.org/lkml/
> 

-----BEGIN PGP SIGNATURE-----
Version: PGP for Personal Privacy 5.0
Charset: noconv

iQEVAwUBNq5c+T7msCGEppcbAQGiwQf9FwZmLH7ZItRrlqz/uijqbH8xDdyZW8QG
bQ/t57UMRhsfnBprs1AT3Iy+3MEZ7oUiUkeDnH5vYm1HvKwYGBW9Derf3Fk4GMZk
S86V1q/kYS3JKN+kXtRz4IOjqgZXzajPCKF8IQhGmD+mBO1gFMomFSqb62161HJj
reHwBt6bbHrP/P7RzKh4SjvZ7vMZy2aZdSCzEWunn5oj9Bazj98fAsluCEwzbgcj
hgfR4DJLUgbqDMM1ewWjnBAqwQrf9hcjblZQrEMGwvtt/ZAQ/Xjba0Akx9NoxIU8
ZYqy2j7AgiYpT/cVArhoLyJIoes1FqtOrcqjKVHgLSRZPgzQUpzBCQ==
=eN9Y
-----END PGP SIGNATURE-----

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 15:21                                                                                                 ` MOLNAR Ingo
@ 1999-01-27 10:31                                                                                                   ` yodaiken
  0 siblings, 0 replies; 243+ messages in thread
From: yodaiken @ 1999-01-27 10:31 UTC (permalink / raw
  To: mingo
  Cc: alan, sct, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

> i dont think it's correct to say: 'anything that cannot be segmented in
> the physical memory space with page granularity, is considered to be
> broken in this regard and is not guaranteed to be 100% supported by the
> Linux architecture'. 

Sure. But let's keep in mind that paging is designed to avoid memory
fragmentation, and big chunks mean you don't care about some minor loss
of usable memory. If you have a 4G phys memory and need big linear pieces,
it is far better to waste 3.9 meg by aligning end of data, than to complicate
all memory allocation techniques in kernel so you don't waste it. In fact
you'd probably want a simplified slab
         kmalloc:
               if size is 4meg allocate a 4meg chunk from 4meg list
               else if size< 4 meg
                    either allocate a new 4 meg chunk and take space from it
                    or find partially used 4 meg chunk with enough space in it.

so you might have a list of partially used 4meg chunks lying about wasting
some space, but ...

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 14:15                                                                                             ` MOLNAR Ingo
  1999-01-26 14:36                                                                                               ` yodaiken
  1999-01-26 16:37                                                                                               ` Stephen C. Tweedie
@ 1999-01-27 11:35                                                                                               ` Jakub Jelinek
  2 siblings, 0 replies; 243+ messages in thread
From: Jakub Jelinek @ 1999-01-27 11:35 UTC (permalink / raw
  To: MOLNAR Ingo
  Cc: alan, sct, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

> > Something like
> > 
> > Chop memory into 4Mb sized chunks that hold the perfectly normal and
> > existing pages and buddy memory allocator. Set a flag on 25-33% of them
> > to a max of say 10 and for <12Mb boxes simply say "tough".
> 
> this is conceptually 'boot-time allocation of big buffers' by splitting
> all available memory into two pieces:
> 
> 	size_kernel: generic memory
> 	size_user: only swappable
> 
> (size_kernel+size_user = ca. size_allmemory)
> 
> This still doesnt solve the 'what if we need more big buffers than
> size_user' and 'what if we need kernel memory more than size_kernel'
> questions, and both are valid.

It does not have to look like that.
I guess we need some size_user memory on some ports anyway (i386 above 1GB
physical, sun4d above 2GB physical, sparc64 memory hotplug mem_map), but the
rest should be general memory.
I bet forcing every single driver in the kernel which ever does some kmalloc
to write hooks for relocating that buffer is utopic - it would add too much
complexity everywhere. You often don't keep track where all you put pointers
to the kmalloced area. So we'll have to live with some unmovable objects.
But for the rest, the memory allocator can behave like this:
either have some chunks in which non-swappable memory is being allocated, or
have some rule, e.g. that non-swappable memory grows from the lowest
physical pages up.
Now, for a swappable get_free_pages you can allocate it from anywhere, but
it would be good to give precedence to the memory outside of the current
non-swappable region(s).
For non-swappable get_free_pages, you first try hard to allocate it from the
current non-swappable region(s) (first looking if there are free pages, then
look if there are swappable pages (and in the latter case either swap them
off, or just move them to swappable regions)). If all non-swappable
region(s) are full of non-swappable allocations, then you allocate another
non-swappable region and swap-off/move some pages from there.
As long as we keep most of the objects swappable/movable, this will work
well. If there are too many unmovable objects, it will lead to deadly
fragmentation and this won't work.
But most of the objects are swappable/movable: everything referenced
in user pages only, vmalloc regions, or can be easily flushed.

Cheers,
    Jakub
___________________________________________________________________
Jakub Jelinek | jj@sunsite.mff.cuni.cz | http://sunsite.mff.cuni.cz
Administrator of SunSITE Czech Republic, MFF, Charles University
___________________________________________________________________
UltraLinux  |  http://ultra.linux.cz/  |  http://ultra.penguin.cz/
Linux version 2.2.0 on a sparc64 machine (3958.37 BogoMips)
___________________________________________________________________
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26  1:57                                                                                   ` Andrea Arcangeli
  1999-01-26 18:37                                                                                     ` Andrea Arcangeli
@ 1999-01-27 12:13                                                                                     ` Stephen C. Tweedie
  1 sibling, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-27 12:13 UTC (permalink / raw
  To: Andrea Arcangeli
  Cc: Stephen C. Tweedie, Linus Torvalds, Alan Cox, werner, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

Hi,

On Tue, 26 Jan 1999 02:57:41 +0100 (CET), Andrea Arcangeli
<andrea@e-mind.com> said:

>> /* We can't free pages unless there's just one user */
>> if (atomic_read(&page->count) != 1)
>> continue;
>> +
>> +		count--;

> but this is plain bogus. When your machine will reach 0 freeable pages
> (and that happens a bit before to kill the process because OOM) you'll get
> an infinite loop in shrink_mmap().

No.  We skip only shared pages.  We still count page tables, vmalloced
pages, task structs, kmalloc/slab pages, dma buffers and all those other
kernel pages which do not belong to the VM.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-25 20:49                                                                                 ` Dr. Werner Fink
  1999-01-25 20:56                                                                                   ` Linus Torvalds
@ 1999-01-27 14:52                                                                                   ` Stephen C. Tweedie
  1999-01-28 19:12                                                                                     ` Dr. Werner Fink
  1 sibling, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-27 14:52 UTC (permalink / raw
  To: Dr. Werner Fink
  Cc: Andrea Arcangeli, Eric W. Biederman, Stephen C. Tweedie,
	Rik van Riel, Zlatko Calusic, Linus Torvalds,
	Savochkin Andrey Vladimirovich, steve, brent verner,
	Garst R. Reese, Kalle Andersson, Ben McCann, Alan Cox, bredelin,
	linux-kernel, linux-mm

Hi,

On Mon, 25 Jan 1999 21:49:29 +0100, "Dr. Werner Fink" <werner@suse.de>
said:

> Ok its a bit better than a single PII 400 MHz :-)
> ... with less than 64MB the break downs are going to be the common state
> whereas with 128MB the system is usable.  Nevertheless whenever both make
> loops taking the filesystem tree at the same time, the system performance
> slows down dramatically (a `break down').

Not for me.  That's probably just the advantage of having swap on a
separate disk, but I've got both a "find /" and a "wc /usr/bin/*"
running right now, and interactive performance is not noticeably
degraded on 2.2.0-release with 64MB (and that is with two active users
on the box right now).  Concurrent filesystem and swap IO on the same
spindle is always going to suck.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 20:48                                                                                           ` Gerard Roudier
  1999-01-26 21:24                                                                                             ` Thomas Sailer
  1999-01-27  0:25                                                                                             ` David Lang
@ 1999-01-27 16:05                                                                                             ` Stephen C. Tweedie
  1999-01-27 20:11                                                                                               ` Gerard Roudier
  2 siblings, 1 reply; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-01-27 16:05 UTC (permalink / raw
  To: Gerard Roudier; +Cc: Thomas Sailer, linux-kernel, linux-mm, Stephen Tweedie

Hi,

On Tue, 26 Jan 1999 21:48:59 +0100 (MET), Gerard Roudier
<groudier@club-internet.fr> said:

> I suggest to allow some application program to decide what stuff to
> victimize and to be able to tell the kernel about, 

Yep, there is already a madvise() function in most modern unixen: it is
especially useful for giving cache hints.  

> There are bunches of things that are widespread used nowadays and that 
> should have disappeard since years if people were a bit more concerned 
> by technical and progress considerations.

Yes.  I see what you mean.  We should immediately remove Linux support
for FAT filesystems, the ISA bus and 8086 virtual mode.

Not.

> For example, it seems that 32 bits systems are not enough to provide a
> flat virtual addressing space far larger than the physical address space
> needed for applications (that was the primary goal of virtual memory
> invention).

*One* of the primary goals.  The other was protected multitasking.  The
x86 architecture today is perfectly well capable of supporting multiple
32-bit address spaces within a 36 bit (64GB) physical address space, and
large multiuser environments would benefit enormously from such an
environment.

> A device that requires more contiguous space than 1 PAGE for its 
> support is crap. 

So?  IDE is crap because it doesn't support multiple outstanding
commands.  If you honestly believe that this means we should remove IDE
support from the kernel, then you are living on another planet where
getting real work done by real users doesn't matter.  Fact is, we _can_
support this stuff, and users want us to.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-27 16:05                                                                                             ` Stephen C. Tweedie
@ 1999-01-27 20:11                                                                                               ` Gerard Roudier
  0 siblings, 0 replies; 243+ messages in thread
From: Gerard Roudier @ 1999-01-27 20:11 UTC (permalink / raw
  To: Stephen C. Tweedie; +Cc: Thomas Sailer, linux-kernel, linux-mm


On Wed, 27 Jan 1999, Stephen C. Tweedie wrote:

> Hi,
> 
> On Tue, 26 Jan 1999 21:48:59 +0100 (MET), Gerard Roudier
> <groudier@club-internet.fr> said:
> 

[ ... ]

> > There are bunches of things that are widespread used nowadays and that 
> > should have disappeard since years if people were a bit more concerned 
> > by technical and progress considerations.
> 
> Yes.  I see what you mean.  We should immediately remove Linux support
> for FAT filesystems, the ISA bus and 8086 virtual mode.
> 
> Not.

AFAIK, it is what M$$ is intending to do. If the Linux strategy is to be 
the greatest O/S for obsolete hardware, we can support that stuff years
after M$$ has dropped the support of it.

> > For example, it seems that 32 bits systems are not enough to provide a
> > flat virtual addressing space far larger than the physical address space
> > needed for applications (that was the primary goal of virtual memory
> > invention).
> 
> *One* of the primary goals.  The other was protected multitasking.  The
> x86 architecture today is perfectly well capable of supporting mutliple
> 32-bit address spaces within a 36 bit (64GB) physical address space, and
> large multiuser environments would benefit enormously from such an
> environment.

64 GB of memory needs 36 address lines. It is obvious to handle that on 64
bit machines that exist since _years_, but very painful on 32 bit
addressing machines. Implementing complex algorithms for handling this
stupidity is stupidity by itself.  32 bit VM architecture is a 25 years
old technology. The fact that it still exists nowadays is because the
market place was more concerned by $$ than by real progress.  If the PC
market had started in 1980, then it might have happen that modern PCs
would use rather Z80s type processors at something like 1 GHZ than 32 bit
PII at 400 MHz.

Just thinking of the ridiculous price fast 32 bits processors are sold
today is the proof, in my opinion, that 32 bit is definitely _dead_.
People that still want to make efforts for that stuff are just stupid, in
my opinion. 

> > A device that requires more contiguous space than 1 PAGE for its 
> > support is crap. 
> 
> So?  IDE is crap because it doesn't support multiple outstanding

Indeed.

> commands.  If you honestly believe that this means we should remove IDE
> support from the kernel, then you are living on another planet where
> getting real work done by real users doesn't matter.  Fact is, we _can_
> support this stuff, and users want us to.

I live on the euro-planet and yours is just a satellite. :-))

Regards,
   Gerard.


--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-27 14:52                                                                                   ` Stephen C. Tweedie
@ 1999-01-28 19:12                                                                                     ` Dr. Werner Fink
  0 siblings, 0 replies; 243+ messages in thread
From: Dr. Werner Fink @ 1999-01-28 19:12 UTC (permalink / raw
  To: Stephen C. Tweedie, Dr. Werner Fink
  Cc: Andrea Arcangeli, Eric W. Biederman, Rik van Riel, Zlatko Calusic,
	Linus Torvalds, Savochkin Andrey Vladimirovich, steve,
	brent verner, Garst R. Reese, Kalle Andersson, Ben McCann,
	Alan Cox, bredelin, linux-kernel, linux-mm

> > Ok its a bit better than a single PII 400 MHz :-)
> > ... with less than 64MB the break downs are going to be the common state
> > whereas with 128MB the system is usable.  Nevertheless whenever both make
> > loops taking the filesystem tree at the same time, the system performance
> > slows down dramatically (a `break down').
> 
> Not for me.  That's probably just the advantage of having swap on a
> separate disk, but I've got both a "find /" and a "wc /usr/bin/*"
> running right now, and interactive performance is not noticeably
> degraded on 2.2.0-release with 64MB (and that is with two active users
> on the box right now).  Concurrent filesystem and swap IO on the same
> spindle is always going to suck.

I'm not talking about a simple find, ... the two "make MAKE='make -j10'"
in /usr/src/linux/ and /usr/src/newkernel/linux/ do force this
`break down' with 2.2.0-pre9 if the two makes are entering
/usr/src/linux/fs/ or /usr/src/newkernel/linux/fs/ respectively
at the same time which increases the load a `bit'.


         Werner

--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-26 16:45                                                                                                   ` Stephen C. Tweedie
@ 1999-01-30  7:01                                                                                                     ` yodaiken
  1999-02-01 13:07                                                                                                       ` Stephen C. Tweedie
  0 siblings, 1 reply; 243+ messages in thread
From: yodaiken @ 1999-01-30  7:01 UTC (permalink / raw
  To: Stephen C. Tweedie
  Cc: alan, mingo, groudier, torvalds, werner, andrea, riel,
	Zlatko.Calusic, ebiederm+eric, saw, steve, damonbrent, reese,
	kalle.andersson, bmccann, bredelin, linux-kernel, linux-mm

> 
> Hi,
> 
> On Tue, 26 Jan 1999 15:46:23 +0000 (GMT), alan@lxorguk.ukuu.org.uk (Alan
> Cox) said:
> > We don't need to solve the 100% case. Simply being sure we can (slowly)
> > allocate up to 25% of RAM in huge chunks is going to be enough. Good point
> > Ingo on one thing I'd missed - the big chunks themselves need some kind
> > of handles since the moment we hand out 512K chunks we may not be able to 
> > shuffle and get a 4Mb block
> 
> The idea was to decide what region to hand out, _then_ to clear it.
> Standard best-fit algorithms apply when carving up the region.

If clearing involves remapping kernel address space, then it's a rather
complex process. 
              kmalloc
              give virt_to_bus to device
              ...
              remap 



--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

* Re: MM deadlock [was: Re: arca-vm-8...]
  1999-01-30  7:01                                                                                                     ` yodaiken
@ 1999-02-01 13:07                                                                                                       ` Stephen C. Tweedie
  0 siblings, 0 replies; 243+ messages in thread
From: Stephen C. Tweedie @ 1999-02-01 13:07 UTC (permalink / raw
  To: yodaiken
  Cc: Stephen C. Tweedie, alan, mingo, groudier, torvalds, werner,
	andrea, riel, Zlatko.Calusic, ebiederm+eric, saw, steve,
	damonbrent, reese, kalle.andersson, bmccann, bredelin,
	linux-kernel, linux-mm

Hi,

On Sat, 30 Jan 1999 00:01:00 -0700 (MST), yodaiken@chelm.cs.nmt.edu
said:

>> The idea was to decide what region to hand out, _then_ to clear it.
>> Standard best-fit algorithms apply when carving up the region.

> If clearing involves remapping kernel address space, then it's a rather
> complex process. 

No, that is the whole point behind restricting such allocations to a
zone containing only swappable objects.  There will be no
non-relocatable objects there, and we can simply swap out each page in
the selected contiguous zone.  

_If_ we expect to do this often then we will want to keep the relocated
pages in memory, but for solving the current problem --- driver
initialisation --- that is not so important and we can rely on the
existing page swap code to just get rid of the data which is in the way.

--Stephen
--
To unsubscribe, send a message with 'unsubscribe linux-mm my@address'
in the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

^ permalink raw reply	[flat|nested] 243+ messages in thread

end of thread, other threads:[~1999-02-01 19:37 UTC | newest]

Thread overview: 243+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <199812290146.BAA12687@terrorserver.swansea.linux.org.uk>
1998-12-31 18:00 ` 2.2.0 Bug summary Andrea Arcangeli
1998-12-31 18:34   ` [patch] new-vm improvement [Re: 2.2.0 Bug summary] Andrea Arcangeli
1999-01-01  0:16     ` Steve Bergman
1999-01-01 17:16       ` Andrea Arcangeli
1999-01-01 16:44     ` Andrea Arcangeli
1999-01-01 20:02       ` Andrea Arcangeli
1999-01-01 23:46         ` Steve Bergman
1999-01-02  6:55           ` Linus Torvalds
1999-01-02  8:33             ` Steve Bergman
1999-01-02 14:48             ` Andrea Arcangeli
1999-01-02 15:38             ` Andrea Arcangeli
1999-01-02 18:10               ` Linus Torvalds
1999-01-02 20:52               ` Andrea Arcangeli
1999-01-03  2:59                 ` Andrea Arcangeli
1999-01-04 18:08                   ` [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] Andrea Arcangeli
1999-01-04 20:56                     ` Linus Torvalds
1999-01-04 21:10                       ` Rik van Riel
1999-01-04 22:04                       ` Alan Cox
1999-01-04 21:55                         ` Linus Torvalds
1999-01-04 22:51                           ` Andrea Arcangeli
1999-01-05  0:32                             ` Andrea Arcangeli
1999-01-05  0:52                               ` Zlatko Calusic
1999-01-05  3:02                               ` Zlatko Calusic
1999-01-05 11:49                                 ` Andrea Arcangeli
1999-01-05 13:23                                   ` Zlatko Calusic
1999-01-05 15:42                                     ` Andrea Arcangeli
1999-01-05 16:16                                       ` Zlatko Calusic
1999-01-05 15:35                               ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Andrea Arcangeli
1999-01-06 14:48                                 ` Andrea Arcangeli
1999-01-06 23:31                                   ` Andrea Arcangeli
1999-01-07  3:32                                     ` Results: 2.2.0-pre5 vs arcavm10 vs arcavm9 vs arcavm7 Steve Bergman
1999-01-07 12:02                                       ` Andrea Arcangeli
1999-01-07 20:27                                         ` Linus Torvalds
1999-01-07 23:56                                           ` Andrea Arcangeli
1999-01-07 17:35                                       ` Linus Torvalds
1999-01-07 18:44                                         ` Zlatko Calusic
1999-01-07 19:33                                           ` Linus Torvalds
1999-01-07 21:10                                             ` Zlatko Calusic
1999-01-07 19:38                                           ` Zlatko Calusic
1999-01-07 19:40                                           ` Andrea Arcangeli
1999-01-09  6:28                                           ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Dax Kelson
1999-01-09  6:32                                             ` Zlatko Calusic
1999-01-09  6:44                                               ` Linus Torvalds
1999-01-09 18:58                                                 ` Andrea Arcangeli
1999-01-11  9:21                                                 ` Buffer handling (setting PG_referenced on access) Zlatko Calusic
1999-01-11 17:44                                                   ` Linus Torvalds
1999-01-11 20:14                                                     ` Zlatko Calusic
1999-01-16 17:35                                                 ` 2.2.0-pre[56] swap performance poor with > 1 thrashing task Andrea Arcangeli
1999-01-09  7:48                                             ` Benjamin Redelings I
1999-01-09  6:53                                               ` Linus Torvalds
1999-01-09 22:39                                       ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
1999-01-10  0:28                                         ` Steve Bergman
1999-01-10  5:35                                           ` Linus Torvalds
1999-01-10 18:33                                             ` Andrea Arcangeli
1999-01-10 18:43                                             ` Steve Bergman
1999-01-10 19:08                                               ` Linus Torvalds
1999-01-10 19:23                                                 ` Vladimir Dergachev
1999-01-10 20:09                                                 ` Andrea Arcangeli
1999-01-10 20:29                                                 ` Steve Bergman
1999-01-10 21:41                                                   ` Linus Torvalds
1999-01-10 23:33                                                     ` testing/pre-7 and do_poll() Chip Salzenberg
1999-01-11  6:02                                                       ` Linus Torvalds
1999-01-11  6:26                                                         ` Chip Salzenberg
1999-01-11  6:46                                                           ` Linus Torvalds
1999-01-11  6:59                                                             ` Chip Salzenberg
1999-01-11  7:02                                                               ` Linus Torvalds
1999-01-11 22:08                                                                 ` Shawn Leas
1999-01-11 22:13                                                                   ` Linus Torvalds
1999-01-12  0:25                                                                     ` estafford
1999-01-12  8:25                                                                       ` Shawn Leas
1999-01-12  7:06                                                                     ` Gregory Maxwell
1999-01-11 20:20                                                       ` Adam Heath
1999-01-11 16:57                                                   ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Steve Bergman
1999-01-11 19:36                                                     ` Andrea Arcangeli
1999-01-11 23:03                                                       ` Andrea Arcangeli
1999-01-11 23:38                                                         ` Zlatko Calusic
1999-01-12  2:02                                                         ` Steve Bergman
1999-01-12  3:21                                                           ` Results: Zlatko's new vm patch Steve Bergman
1999-01-12  5:33                                                             ` Linus Torvalds
1999-01-12 14:49                                                               ` Andrea Arcangeli
1999-01-12 16:58                                                               ` Joseph Anthony
1999-01-12 18:16                                                                 ` Stephen C. Tweedie
1999-01-12 20:15                                                                   ` Michael K Vance
1999-01-13 19:25                                                                     ` Stephen C. Tweedie
1999-01-12 18:24                                                               ` Michael K Vance
1999-01-13  0:01                                                               ` Where to find pre7. Was: " Robert Thorncrantz
1999-01-13 20:47                                                             ` [patch] arca-vm-19 [Re: Results: Zlatko's new vm patch] Andrea Arcangeli
1999-01-14 12:30                                                               ` Andrea Arcangeli
1999-01-15 23:56                                                                 ` [patch] NEW: arca-vm-21, swapout via shrink_mmap using PG_dirty Andrea Arcangeli
1999-01-16 16:49                                                                   ` Andrea Arcangeli
1999-01-17 23:47                                                                     ` Andrea Arcangeli
1999-01-18  5:11                                                                       ` Linus Torvalds
1999-01-18  7:28                                                                         ` Eric W. Biederman
1999-01-18 10:00                                                                           ` Andrea Arcangeli
1999-01-18  9:15                                                                         ` Andrea Arcangeli
1999-01-18 17:49                                                                           ` Linus Torvalds
1999-01-18 19:22                                                                       ` Andrea Arcangeli
1999-01-10 20:40                                               ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Andrea Arcangeli
1999-01-10 20:50                                                 ` Linus Torvalds
1999-01-10 21:01                                                   ` Andrea Arcangeli
1999-01-10 21:51                                                     ` Steve Bergman
1999-01-10 22:50                                                       ` Results: arcavm15, et. al Steve Bergman
1999-01-11  0:20                                                         ` Steve Bergman
1999-01-11 13:21                                                         ` Andrea Arcangeli
1999-01-11  3:47                                         ` Results: pre6 vs pre6+zlatko's_patch vs pre5 vs arcavm13 Gregory Maxwell
1999-01-06 23:35                                   ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Linus Torvalds
1999-01-07  4:30                                     ` Eric W. Biederman
1999-01-07 17:56                                       ` Linus Torvalds
1999-01-07 18:18                                         ` Rik van Riel
1999-01-07 19:19                                           ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] Alan Cox
1999-01-07 18:55                                         ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Zlatko Calusic
1999-01-07 22:57                                         ` Linus Torvalds
1999-01-08  1:16                                           ` Linus Torvalds
1999-01-08 10:45                                             ` Andrea Arcangeli
1999-01-08 19:06                                               ` Linus Torvalds
1999-01-09  9:43                                           ` MM deadlock [was: Re: arca-vm-8...] Savochkin Andrey Vladimirovich
1999-01-09 18:00                                             ` Linus Torvalds
1999-01-09 18:41                                               ` Andrea Arcangeli
1999-01-10 21:41                                                 ` Stephen C. Tweedie
1999-01-10 21:47                                                   ` Linus Torvalds
1999-01-09 21:50                                               ` Linus Torvalds
1999-01-10 11:56                                                 ` Savochkin Andrey Vladimirovich
1999-01-10 17:59                                                   ` Andrea Arcangeli
1999-01-10 22:33                                                   ` Stephen C. Tweedie
1999-01-10 16:59                                                 ` Stephen C. Tweedie
1999-01-10 18:13                                                   ` Andrea Arcangeli
1999-01-10 18:35                                                   ` Linus Torvalds
1999-01-10 19:45                                                     ` Alan Cox
1999-01-10 19:03                                                       ` Andrea Arcangeli
1999-01-10 21:39                                                         ` Stephen C. Tweedie
1999-01-10 19:09                                                       ` Linus Torvalds
1999-01-10 20:33                                                         ` Alan Cox
1999-01-10 20:07                                                           ` Linus Torvalds
1999-01-10 22:18                                                     ` Stephen C. Tweedie
1999-01-10 22:49                                                     ` Stephen C. Tweedie
1999-01-11  6:04                                                       ` Eric W. Biederman
1999-01-12 16:06                                                         ` Stephen C. Tweedie
1999-01-12 17:54                                                           ` Linus Torvalds
1999-01-12 18:44                                                             ` Zlatko Calusic
1999-01-12 19:05                                                               ` Andrea Arcangeli
1999-01-13 17:48                                                                 ` Stephen C. Tweedie
1999-01-13 18:07                                                                   ` 2.2.0-pre6 ain't nice =( Kalle Andersson
1999-01-13 19:05                                                                   ` MM deadlock [was: Re: arca-vm-8...] Alan Cox
1999-01-13 19:23                                                                     ` MOLNAR Ingo
1999-01-13 19:26                                                                     ` Andrea Arcangeli
1999-01-14 11:02                                                                       ` Mike Jagdis
1999-01-14 22:38                                                                         ` Andrea Arcangeli
1999-01-15  7:40                                                                       ` Agus Budy Wuysang
1999-01-14 10:48                                                                   ` Mike Jagdis
1999-01-12 21:46                                                               ` Rik van Riel
1999-01-13  6:52                                                                 ` Zlatko Calusic
1999-01-13 13:45                                                                 ` Andrea Arcangeli
1999-01-13 13:58                                                                   ` Chris Evans
1999-01-13 15:07                                                                     ` Andrea Arcangeli
1999-01-13 22:11                                                                       ` Stephen C. Tweedie
1999-01-13 14:59                                                                   ` Rik van Riel
1999-01-13 18:10                                                                     ` Andrea Arcangeli
1999-01-13 22:14                                                                       ` Stephen C. Tweedie
1999-01-14 14:53                                                                         ` Dr. Werner Fink
1999-01-21 16:50                                                                           ` Stephen C. Tweedie
1999-01-21 19:53                                                                             ` Andrea Arcangeli
1999-01-22 13:55                                                                               ` Stephen C. Tweedie
1999-01-22 19:45                                                                                 ` Andrea Arcangeli
1999-01-23 23:20                                                                             ` Alan Cox
1999-01-24  0:19                                                                               ` Linus Torvalds
1999-01-24 18:33                                                                                 ` Gregory Maxwell
1999-01-25  0:21                                                                                   ` Linus Torvalds
1999-01-25  1:28                                                                                     ` Alan Cox
1999-01-25  3:35                                                                                       ` pmonta
1999-01-25  4:17                                                                                       ` Linus Torvalds
1999-01-24 20:33                                                                                 ` Alan Cox
1999-01-25  0:27                                                                                   ` Linus Torvalds
1999-01-25  1:38                                                                                     ` Alan Cox
1999-01-25  1:04                                                                                       ` Andrea Arcangeli
1999-01-25  2:10                                                                                         ` Alan Cox
1999-01-25  3:16                                                                                           ` Garst R. Reese
1999-01-25 10:49                                                                                             ` Alan Cox
1999-01-25 14:06                                                                                           ` Rik van Riel
1999-01-25 21:59                                                                                       ` Gerard Roudier
1999-01-26 11:45                                                                                         ` Thomas Sailer
1999-01-26 20:48                                                                                           ` Gerard Roudier
1999-01-26 21:24                                                                                             ` Thomas Sailer
1999-01-27  0:25                                                                                             ` David Lang
1999-01-27 16:05                                                                                             ` Stephen C. Tweedie
1999-01-27 20:11                                                                                               ` Gerard Roudier
1999-01-26 13:06                                                                                         ` Stephen C. Tweedie
1999-01-26 14:28                                                                                           ` Alan Cox
1999-01-26 14:15                                                                                             ` MOLNAR Ingo
1999-01-26 14:36                                                                                               ` yodaiken
1999-01-26 15:21                                                                                                 ` MOLNAR Ingo
1999-01-27 10:31                                                                                                   ` yodaiken
1999-01-26 15:46                                                                                                 ` Alan Cox
1999-01-26 16:45                                                                                                   ` Stephen C. Tweedie
1999-01-30  7:01                                                                                                     ` yodaiken
1999-02-01 13:07                                                                                                       ` Stephen C. Tweedie
1999-01-26 16:37                                                                                               ` Stephen C. Tweedie
1999-01-27 11:35                                                                                               ` Jakub Jelinek
1999-01-26 14:21                                                                                             ` Rik van Riel
1999-01-25 16:25                                                                                 ` Stephen C. Tweedie
1999-01-25 16:52                                                                                   ` Andrea Arcangeli
1999-01-25 18:27                                                                                   ` Linus Torvalds
1999-01-25 18:43                                                                                     ` Stephen C. Tweedie
1999-01-25 18:49                                                                                       ` Linus Torvalds
1999-01-25 18:43                                                                                   ` Linus Torvalds
1999-01-25 19:15                                                                                     ` Stephen C. Tweedie
1999-01-26  1:57                                                                                   ` Andrea Arcangeli
1999-01-26 18:37                                                                                     ` Andrea Arcangeli
1999-01-27 12:13                                                                                     ` Stephen C. Tweedie
1999-01-22 16:29                                                                           ` Eric W. Biederman
1999-01-25 13:14                                                                             ` Dr. Werner Fink
1999-01-25 17:56                                                                               ` Stephen C. Tweedie
1999-01-25 19:10                                                                               ` Andrea Arcangeli
1999-01-25 20:49                                                                                 ` Dr. Werner Fink
1999-01-25 20:56                                                                                   ` Linus Torvalds
1999-01-26 12:23                                                                                     ` Rik van Riel
1999-01-26 15:44                                                                                       ` Andrea Arcangeli
1999-01-27 14:52                                                                                   ` Stephen C. Tweedie
1999-01-28 19:12                                                                                     ` Dr. Werner Fink
1999-01-13 17:55                                                                   ` [PATCH] " Stephen C. Tweedie
1999-01-13 18:52                                                                     ` Andrea Arcangeli
1999-01-13 22:10                                                                       ` Stephen C. Tweedie
1999-01-13 22:30                                                                         ` Linus Torvalds
1999-01-11 11:20                                                       ` Pavel Machek
1999-01-11 17:35                                                         ` Stephen C. Tweedie
1999-01-11 14:11                                                     ` Savochkin Andrey Vladimirovich
1999-01-11 17:55                                                       ` Linus Torvalds
1999-01-11 18:37                                                         ` Andrea Arcangeli
1999-01-08  2:56                                         ` arca-vm-8 [Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm , improvement , [Re: 2.2.0 Bug summary]]] Eric W. Biederman
1999-01-09  0:50                                         ` David S. Miller
1999-01-09  2:13                                         ` Stephen C. Tweedie
1999-01-09  2:34                                           ` Andrea Arcangeli
1999-01-09  9:30                                             ` Stephen C. Tweedie
1999-01-09 12:11                                           ` Andrea Arcangeli
1999-01-07 14:11                                     ` Andrea Arcangeli
1999-01-07 18:19                                       ` Linus Torvalds
1999-01-07 20:35                                         ` Andrea Arcangeli
1999-01-07 23:51                                           ` Linus Torvalds
1999-01-08  0:04                                             ` Andrea Arcangeli
1999-01-04 22:43                         ` [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]] Andrea Arcangeli
1999-01-04 22:29                       ` Andrea Arcangeli
1999-01-05 13:33                   ` [patch] new-vm improvement [Re: 2.2.0 Bug summary] Ben McCann
1999-01-02 20:04             ` Steve Bergman
1999-01-02  3:03         ` Andrea Arcangeli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.