All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] Deferred memory initialisation fixes
@ 2015-07-17 12:22 ` Mel Gorman
  0 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

This series addresses problems reported with deferred memory initialisation
and are needed for 4.2. The first and second patches have not been confirmed
by the reporters as fixing their problems but I could replicate the issues
and they Worked For Me. The last one has been verified as working.

 fs/dcache.c        | 13 +++----------
 fs/file_table.c    | 24 +++++++++++++++---------
 include/linux/fs.h |  5 +++--
 init/main.c        |  2 +-
 mm/page_alloc.c    | 44 ++++++++++++++++++++++++++++++--------------
 5 files changed, 52 insertions(+), 36 deletions(-)

-- 
2.4.3


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 0/3] Deferred memory initialisation fixes
@ 2015-07-17 12:22 ` Mel Gorman
  0 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

This series addresses problems reported with deferred memory initialisation
and are needed for 4.2. The first and second patches have not been confirmed
by the reporters as fixing their problems but I could replicate the issues
and they Worked For Me. The last one has been verified as working.

 fs/dcache.c        | 13 +++----------
 fs/file_table.c    | 24 +++++++++++++++---------
 include/linux/fs.h |  5 +++--
 init/main.c        |  2 +-
 mm/page_alloc.c    | 44 ++++++++++++++++++++++++++++++--------------
 5 files changed, 52 insertions(+), 36 deletions(-)

-- 
2.4.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/3] mm, meminit: replace rwsem with completion
  2015-07-17 12:22 ` Mel Gorman
@ 2015-07-17 12:22   ` Mel Gorman
  -1 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

From: Nicolai Stange <nicstange@gmail.com>

Commit 0e1cc95b4cc7 ("mm: meminit: finish initialisation of struct pages
before basic setup") introduced a rwsem to signal completion of the
initialization workers.

Lockdep complains about possible recursive locking:
  =============================================
  [ INFO: possible recursive locking detected ]
  4.1.0-12802-g1dc51b8 #3 Not tainted
  ---------------------------------------------
  swapper/0/1 is trying to acquire lock:
  (pgdat_init_rwsem){++++.+},
    at: [<ffffffff8424c7fb>] page_alloc_init_late+0xc7/0xe6

  but task is already holding lock:
  (pgdat_init_rwsem){++++.+},
    at: [<ffffffff8424c772>] page_alloc_init_late+0x3e/0xe6

Replace the rwsem by a completion together with an atomic
"outstanding work counter".

[peterz@infradead.org: Barrier removal on the grounds of being pointless]
[mgorman@suse.de: Applied review feedback]
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 506eac8b38af..a69e78c396a0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,7 +18,6 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
-#include <linux/rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
@@ -1062,7 +1061,15 @@ static void __init deferred_free_range(struct page *page,
 		__free_pages_boot_core(page, pfn, 0);
 }
 
-static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+	if (atomic_dec_and_test(&pgdat_init_n_undone))
+		complete(&pgdat_init_all_done_comp);
+}
 
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
@@ -1079,7 +1086,7 @@ static int __init deferred_init_memmap(void *data)
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
 	if (first_init_pfn == ULONG_MAX) {
-		up_read(&pgdat_init_rwsem);
+		pgdat_init_report_one_done();
 		return 0;
 	}
 
@@ -1179,7 +1186,8 @@ free_range:
 
 	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
 					jiffies_to_msecs(jiffies - start));
-	up_read(&pgdat_init_rwsem);
+
+	pgdat_init_report_one_done();
 	return 0;
 }
 
@@ -1187,14 +1195,14 @@ void __init page_alloc_init_late(void)
 {
 	int nid;
 
+	/* There will be num_node_state(N_MEMORY) threads */
+	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
 	for_each_node_state(nid, N_MEMORY) {
-		down_read(&pgdat_init_rwsem);
 		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
 	}
 
 	/* Block until all are initialised */
-	down_write(&pgdat_init_rwsem);
-	up_write(&pgdat_init_rwsem);
+	wait_for_completion(&pgdat_init_all_done_comp);
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
2.4.3


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 1/3] mm, meminit: replace rwsem with completion
@ 2015-07-17 12:22   ` Mel Gorman
  0 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

From: Nicolai Stange <nicstange@gmail.com>

Commit 0e1cc95b4cc7 ("mm: meminit: finish initialisation of struct pages
before basic setup") introduced a rwsem to signal completion of the
initialization workers.

Lockdep complains about possible recursive locking:
  =============================================
  [ INFO: possible recursive locking detected ]
  4.1.0-12802-g1dc51b8 #3 Not tainted
  ---------------------------------------------
  swapper/0/1 is trying to acquire lock:
  (pgdat_init_rwsem){++++.+},
    at: [<ffffffff8424c7fb>] page_alloc_init_late+0xc7/0xe6

  but task is already holding lock:
  (pgdat_init_rwsem){++++.+},
    at: [<ffffffff8424c772>] page_alloc_init_late+0x3e/0xe6

Replace the rwsem by a completion together with an atomic
"outstanding work counter".

[peterz@infradead.org: Barrier removal on the grounds of being pointless]
[mgorman@suse.de: Applied review feedback]
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 506eac8b38af..a69e78c396a0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,7 +18,6 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
-#include <linux/rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
@@ -1062,7 +1061,15 @@ static void __init deferred_free_range(struct page *page,
 		__free_pages_boot_core(page, pfn, 0);
 }
 
-static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+	if (atomic_dec_and_test(&pgdat_init_n_undone))
+		complete(&pgdat_init_all_done_comp);
+}
 
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
@@ -1079,7 +1086,7 @@ static int __init deferred_init_memmap(void *data)
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
 	if (first_init_pfn == ULONG_MAX) {
-		up_read(&pgdat_init_rwsem);
+		pgdat_init_report_one_done();
 		return 0;
 	}
 
@@ -1179,7 +1186,8 @@ free_range:
 
 	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
 					jiffies_to_msecs(jiffies - start));
-	up_read(&pgdat_init_rwsem);
+
+	pgdat_init_report_one_done();
 	return 0;
 }
 
@@ -1187,14 +1195,14 @@ void __init page_alloc_init_late(void)
 {
 	int nid;
 
+	/* There will be num_node_state(N_MEMORY) threads */
+	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
 	for_each_node_state(nid, N_MEMORY) {
-		down_read(&pgdat_init_rwsem);
 		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
 	}
 
 	/* Block until all are initialised */
-	down_write(&pgdat_init_rwsem);
-	up_write(&pgdat_init_rwsem);
+	wait_for_completion(&pgdat_init_all_done_comp);
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
2.4.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/3] fs, file table: Reinit files_stat.max_files after deferred memory initialisation
  2015-07-17 12:22 ` Mel Gorman
@ 2015-07-17 12:22   ` Mel Gorman
  -1 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

Dave Hansen reported the following;

	My laptop has been behaving strangely with 4.2-rc2.  Once I log
	in to my X session, I start getting all kinds of strange errors
	from applications and see this in my dmesg:

        	VFS: file-max limit 8192 reached

The problem is that the file-max is calculated before memory is fully
initialised and miscalculates how much memory the kernel is using. This
patch recalculates file-max after deferred memory initialisation. Note
that using memory hotplug infrastructure would not have avoided this
problem as the value is not recalculated after memory hot-add.

4.1:             files_stat.max_files = 6582781
4.2-rc2:         files_stat.max_files = 8192
4.2-rc2 patched: files_stat.max_files = 6562467

Small differences with the patch applied and 4.1 but not enough to matter.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 fs/dcache.c        | 13 +++----------
 fs/file_table.c    | 24 +++++++++++++++---------
 include/linux/fs.h |  5 +++--
 init/main.c        |  2 +-
 mm/page_alloc.c    |  3 +++
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c8ea15e73a5..9b5fe503f6cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3442,22 +3442,15 @@ void __init vfs_caches_init_early(void)
 	inode_init_early();
 }
 
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
 {
-	unsigned long reserve;
-
-	/* Base hash sizes on available memory, with a reserve equal to
-           150% of current kernel size */
-
-	reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
-	mempages -= reserve;
-
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	dcache_init();
 	inode_init();
-	files_init(mempages);
+	files_init();
+	files_maxfiles_init();
 	mnt_init();
 	bdev_cache_init();
 	chrdev_init();
diff --git a/fs/file_table.c b/fs/file_table.c
index 7f9d407c7595..ad17e05ebf95 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,6 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/task_work.h>
 #include <linux/ima.h>
+#include <linux/swap.h>
 
 #include <linux/atomic.h>
 
@@ -308,19 +309,24 @@ void put_filp(struct file *file)
 	}
 }
 
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
 { 
-	unsigned long n;
-
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
 
-	/*
-	 * One file with associated inode and dcache is very roughly 1K.
-	 * Per default don't use more than 10% of our memory for files. 
-	 */ 
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+	unsigned long n;
+	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+	memreserve = min(memreserve, totalram_pages - 1);
+	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
-	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 } 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0653e560c26..e6ceaae3a50e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -55,7 +55,8 @@ struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
-extern void __init files_init(unsigned long);
+extern void __init files_init(void);
+extern void __init files_maxfiles_init(void);
 
 extern struct files_stat_struct files_stat;
 extern unsigned long get_max_files(void);
@@ -2235,7 +2236,7 @@ extern int ioctl_preallocate(struct file *filp, void __user *argp);
 
 /* fs/dcache.c */
 extern void __init vfs_caches_init_early(void);
-extern void __init vfs_caches_init(unsigned long);
+extern void __init vfs_caches_init(void);
 
 extern struct kmem_cache *names_cachep;
 
diff --git a/init/main.c b/init/main.c
index c5d5626289ce..56506553d4d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -656,7 +656,7 @@ asmlinkage __visible void __init start_kernel(void)
 	key_init();
 	security_init();
 	dbg_late_init();
-	vfs_caches_init(totalram_pages);
+	vfs_caches_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a69e78c396a0..94e2599830c2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1203,6 +1203,9 @@ void __init page_alloc_init_late(void)
 
 	/* Block until all are initialised */
 	wait_for_completion(&pgdat_init_all_done_comp);
+
+	/* Reinit limits that are based on free pages after the kernel is up */
+	files_maxfiles_init();
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
2.4.3


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/3] fs, file table: Reinit files_stat.max_files after deferred memory initialisation
@ 2015-07-17 12:22   ` Mel Gorman
  0 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

Dave Hansen reported the following;

	My laptop has been behaving strangely with 4.2-rc2.  Once I log
	in to my X session, I start getting all kinds of strange errors
	from applications and see this in my dmesg:

        	VFS: file-max limit 8192 reached

The problem is that the file-max is calculated before memory is fully
initialised and miscalculates how much memory the kernel is using. This
patch recalculates file-max after deferred memory initialisation. Note
that using memory hotplug infrastructure would not have avoided this
problem as the value is not recalculated after memory hot-add.

4.1:             files_stat.max_files = 6582781
4.2-rc2:         files_stat.max_files = 8192
4.2-rc2 patched: files_stat.max_files = 6562467

Small differences with the patch applied and 4.1 but not enough to matter.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 fs/dcache.c        | 13 +++----------
 fs/file_table.c    | 24 +++++++++++++++---------
 include/linux/fs.h |  5 +++--
 init/main.c        |  2 +-
 mm/page_alloc.c    |  3 +++
 5 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 5c8ea15e73a5..9b5fe503f6cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3442,22 +3442,15 @@ void __init vfs_caches_init_early(void)
 	inode_init_early();
 }
 
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
 {
-	unsigned long reserve;
-
-	/* Base hash sizes on available memory, with a reserve equal to
-           150% of current kernel size */
-
-	reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
-	mempages -= reserve;
-
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	dcache_init();
 	inode_init();
-	files_init(mempages);
+	files_init();
+	files_maxfiles_init();
 	mnt_init();
 	bdev_cache_init();
 	chrdev_init();
diff --git a/fs/file_table.c b/fs/file_table.c
index 7f9d407c7595..ad17e05ebf95 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -25,6 +25,7 @@
 #include <linux/hardirq.h>
 #include <linux/task_work.h>
 #include <linux/ima.h>
+#include <linux/swap.h>
 
 #include <linux/atomic.h>
 
@@ -308,19 +309,24 @@ void put_filp(struct file *file)
 	}
 }
 
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
 { 
-	unsigned long n;
-
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
 
-	/*
-	 * One file with associated inode and dcache is very roughly 1K.
-	 * Per default don't use more than 10% of our memory for files. 
-	 */ 
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+	unsigned long n;
+	unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+	memreserve = min(memreserve, totalram_pages - 1);
+	n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
 
-	n = (mempages * (PAGE_SIZE / 1024)) / 10;
 	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
-	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 } 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0653e560c26..e6ceaae3a50e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -55,7 +55,8 @@ struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
-extern void __init files_init(unsigned long);
+extern void __init files_init(void);
+extern void __init files_maxfiles_init(void);
 
 extern struct files_stat_struct files_stat;
 extern unsigned long get_max_files(void);
@@ -2235,7 +2236,7 @@ extern int ioctl_preallocate(struct file *filp, void __user *argp);
 
 /* fs/dcache.c */
 extern void __init vfs_caches_init_early(void);
-extern void __init vfs_caches_init(unsigned long);
+extern void __init vfs_caches_init(void);
 
 extern struct kmem_cache *names_cachep;
 
diff --git a/init/main.c b/init/main.c
index c5d5626289ce..56506553d4d8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -656,7 +656,7 @@ asmlinkage __visible void __init start_kernel(void)
 	key_init();
 	security_init();
 	dbg_late_init();
-	vfs_caches_init(totalram_pages);
+	vfs_caches_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a69e78c396a0..94e2599830c2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1203,6 +1203,9 @@ void __init page_alloc_init_late(void)
 
 	/* Block until all are initialised */
 	wait_for_completion(&pgdat_init_all_done_comp);
+
+	/* Reinit limits that are based on free pages after the kernel is up */
+	files_maxfiles_init();
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-- 
2.4.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
  2015-07-17 12:22 ` Mel Gorman
@ 2015-07-17 12:22   ` Mel Gorman
  -1 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

early_pfn_to_nid historically was inherently not SMP safe but only
used during boot which is inherently single threaded or during hotplug
which is protected by a giant mutex. With deferred memory initialisation
there was a thread-safe version introduced and the early_pfn_to_nid
would trigger a BUG_ON if used unsafely. Memory hotplug hit that check.
This patch makes early_pfn_to_nid introduces a lock to make it safe to
use during hotplug.

Reported-and-tested-by: Alex Ng <alexng@microsoft.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94e2599830c2..f1e841c67b7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -982,21 +982,26 @@ static void __init __free_pages_boot_core(struct page *page,
 
 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
 	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
 
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
+	static DEFINE_SPINLOCK(early_pfn_lock);
 	int nid;
 
-	/* The system will behave unpredictably otherwise */
-	BUG_ON(system_state != SYSTEM_BOOTING);
+	/* Avoid locking overhead during boot but hotplug must lock */
+	if (system_state != SYSTEM_BOOTING)
+		spin_lock(&early_pfn_lock);
 
 	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-	if (nid >= 0)
-		return nid;
-	/* just returns 0 */
-	return 0;
+	if (nid < 0)
+		nid = 0;
+
+	if (system_state != SYSTEM_BOOTING)
+		spin_unlock(&early_pfn_lock);
+
+	return nid;
 }
 #endif
 
-- 
2.4.3


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
@ 2015-07-17 12:22   ` Mel Gorman
  0 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 12:22 UTC (permalink / raw
  To: Andrew Morton
  Cc: Nicolai Stange, Peter Zijlstra, Dave Hansen, Alex Ng,
	Fengguang Wu, Linux-MM, LKML, Mel Gorman

early_pfn_to_nid historically was inherently not SMP safe but only
used during boot which is inherently single threaded or during hotplug
which is protected by a giant mutex. With deferred memory initialisation
there was a thread-safe version introduced and the early_pfn_to_nid
would trigger a BUG_ON if used unsafely. Memory hotplug hit that check.
This patch makes early_pfn_to_nid introduces a lock to make it safe to
use during hotplug.

Reported-and-tested-by: Alex Ng <alexng@microsoft.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94e2599830c2..f1e841c67b7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -982,21 +982,26 @@ static void __init __free_pages_boot_core(struct page *page,
 
 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
 	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
 
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
+	static DEFINE_SPINLOCK(early_pfn_lock);
 	int nid;
 
-	/* The system will behave unpredictably otherwise */
-	BUG_ON(system_state != SYSTEM_BOOTING);
+	/* Avoid locking overhead during boot but hotplug must lock */
+	if (system_state != SYSTEM_BOOTING)
+		spin_lock(&early_pfn_lock);
 
 	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-	if (nid >= 0)
-		return nid;
-	/* just returns 0 */
-	return 0;
+	if (nid < 0)
+		nid = 0;
+
+	if (system_state != SYSTEM_BOOTING)
+		spin_unlock(&early_pfn_lock);
+
+	return nid;
 }
 #endif
 
-- 
2.4.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
  2015-07-17 12:22   ` Mel Gorman
@ 2015-07-17 13:12     ` Peter Zijlstra
  -1 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:12 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
>  int __meminit early_pfn_to_nid(unsigned long pfn)
>  {
> +	static DEFINE_SPINLOCK(early_pfn_lock);
>  	int nid;
>  
> -	/* The system will behave unpredictably otherwise */
> -	BUG_ON(system_state != SYSTEM_BOOTING);
> +	/* Avoid locking overhead during boot but hotplug must lock */
> +	if (system_state != SYSTEM_BOOTING)
> +		spin_lock(&early_pfn_lock);
>  
>  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> -	if (nid >= 0)
> -		return nid;
> -	/* just returns 0 */
> -	return 0;
> +	if (nid < 0)
> +		nid = 0;
> +
> +	if (system_state != SYSTEM_BOOTING)
> +		spin_unlock(&early_pfn_lock);
> +
> +	return nid;
>  }

Why the conditional locking?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
@ 2015-07-17 13:12     ` Peter Zijlstra
  0 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:12 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
>  int __meminit early_pfn_to_nid(unsigned long pfn)
>  {
> +	static DEFINE_SPINLOCK(early_pfn_lock);
>  	int nid;
>  
> -	/* The system will behave unpredictably otherwise */
> -	BUG_ON(system_state != SYSTEM_BOOTING);
> +	/* Avoid locking overhead during boot but hotplug must lock */
> +	if (system_state != SYSTEM_BOOTING)
> +		spin_lock(&early_pfn_lock);
>  
>  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> -	if (nid >= 0)
> -		return nid;
> -	/* just returns 0 */
> -	return 0;
> +	if (nid < 0)
> +		nid = 0;
> +
> +	if (system_state != SYSTEM_BOOTING)
> +		spin_unlock(&early_pfn_lock);
> +
> +	return nid;
>  }

Why the conditional locking?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] mm, meminit: replace rwsem with completion
  2015-07-17 12:22   ` Mel Gorman
@ 2015-07-17 13:12     ` Peter Zijlstra
  -1 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:12 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 01:22:02PM +0100, Mel Gorman wrote:
> From: Nicolai Stange <nicstange@gmail.com>
> 
> Commit 0e1cc95b4cc7 ("mm: meminit: finish initialisation of struct pages
> before basic setup") introduced a rwsem to signal completion of the
> initialization workers.
> 
> Lockdep complains about possible recursive locking:
>   =============================================
>   [ INFO: possible recursive locking detected ]
>   4.1.0-12802-g1dc51b8 #3 Not tainted
>   ---------------------------------------------
>   swapper/0/1 is trying to acquire lock:
>   (pgdat_init_rwsem){++++.+},
>     at: [<ffffffff8424c7fb>] page_alloc_init_late+0xc7/0xe6
> 
>   but task is already holding lock:
>   (pgdat_init_rwsem){++++.+},
>     at: [<ffffffff8424c772>] page_alloc_init_late+0x3e/0xe6
> 
> Replace the rwsem by a completion together with an atomic
> "outstanding work counter".
> 
> [peterz@infradead.org: Barrier removal on the grounds of being pointless]
> [mgorman@suse.de: Applied review feedback]
> Signed-off-by: Nicolai Stange <nicstange@gmail.com>
> Signed-off-by: Mel Gorman <mgorman@suse.de>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Thanks!

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] mm, meminit: replace rwsem with completion
@ 2015-07-17 13:12     ` Peter Zijlstra
  0 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:12 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 01:22:02PM +0100, Mel Gorman wrote:
> From: Nicolai Stange <nicstange@gmail.com>
> 
> Commit 0e1cc95b4cc7 ("mm: meminit: finish initialisation of struct pages
> before basic setup") introduced a rwsem to signal completion of the
> initialization workers.
> 
> Lockdep complains about possible recursive locking:
>   =============================================
>   [ INFO: possible recursive locking detected ]
>   4.1.0-12802-g1dc51b8 #3 Not tainted
>   ---------------------------------------------
>   swapper/0/1 is trying to acquire lock:
>   (pgdat_init_rwsem){++++.+},
>     at: [<ffffffff8424c7fb>] page_alloc_init_late+0xc7/0xe6
> 
>   but task is already holding lock:
>   (pgdat_init_rwsem){++++.+},
>     at: [<ffffffff8424c772>] page_alloc_init_late+0x3e/0xe6
> 
> Replace the rwsem by a completion together with an atomic
> "outstanding work counter".
> 
> [peterz@infradead.org: Barrier removal on the grounds of being pointless]
> [mgorman@suse.de: Applied review feedback]
> Signed-off-by: Nicolai Stange <nicstange@gmail.com>
> Signed-off-by: Mel Gorman <mgorman@suse.de>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Thanks!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
  2015-07-17 13:12     ` Peter Zijlstra
@ 2015-07-17 13:17       ` Mel Gorman
  -1 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 13:17 UTC (permalink / raw
  To: Peter Zijlstra
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 03:12:32PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
> >  int __meminit early_pfn_to_nid(unsigned long pfn)
> >  {
> > +	static DEFINE_SPINLOCK(early_pfn_lock);
> >  	int nid;
> >  
> > -	/* The system will behave unpredictably otherwise */
> > -	BUG_ON(system_state != SYSTEM_BOOTING);
> > +	/* Avoid locking overhead during boot but hotplug must lock */
> > +	if (system_state != SYSTEM_BOOTING)
> > +		spin_lock(&early_pfn_lock);
> >  
> >  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> > -	if (nid >= 0)
> > -		return nid;
> > -	/* just returns 0 */
> > -	return 0;
> > +	if (nid < 0)
> > +		nid = 0;
> > +
> > +	if (system_state != SYSTEM_BOOTING)
> > +		spin_unlock(&early_pfn_lock);
> > +
> > +	return nid;
> >  }
> 
> Why the conditional locking?

Unnecessary during boot when it's inherently serialised. The point of
the deferred initialisation was to boot as quickly as possible.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
@ 2015-07-17 13:17       ` Mel Gorman
  0 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 13:17 UTC (permalink / raw
  To: Peter Zijlstra
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 03:12:32PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
> >  int __meminit early_pfn_to_nid(unsigned long pfn)
> >  {
> > +	static DEFINE_SPINLOCK(early_pfn_lock);
> >  	int nid;
> >  
> > -	/* The system will behave unpredictably otherwise */
> > -	BUG_ON(system_state != SYSTEM_BOOTING);
> > +	/* Avoid locking overhead during boot but hotplug must lock */
> > +	if (system_state != SYSTEM_BOOTING)
> > +		spin_lock(&early_pfn_lock);
> >  
> >  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> > -	if (nid >= 0)
> > -		return nid;
> > -	/* just returns 0 */
> > -	return 0;
> > +	if (nid < 0)
> > +		nid = 0;
> > +
> > +	if (system_state != SYSTEM_BOOTING)
> > +		spin_unlock(&early_pfn_lock);
> > +
> > +	return nid;
> >  }
> 
> Why the conditional locking?

Unnecessary during boot when it's inherently serialised. The point of
the deferred initialisation was to boot as quickly as possible.

-- 
Mel Gorman
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
  2015-07-17 13:17       ` Mel Gorman
@ 2015-07-17 13:29         ` Peter Zijlstra
  -1 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:29 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 02:17:29PM +0100, Mel Gorman wrote:
> On Fri, Jul 17, 2015 at 03:12:32PM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
> > >  int __meminit early_pfn_to_nid(unsigned long pfn)
> > >  {
> > > +	static DEFINE_SPINLOCK(early_pfn_lock);
> > >  	int nid;
> > >  
> > > -	/* The system will behave unpredictably otherwise */
> > > -	BUG_ON(system_state != SYSTEM_BOOTING);
> > > +	/* Avoid locking overhead during boot but hotplug must lock */
> > > +	if (system_state != SYSTEM_BOOTING)
> > > +		spin_lock(&early_pfn_lock);
> > >  
> > >  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> > > -	if (nid >= 0)
> > > -		return nid;
> > > -	/* just returns 0 */
> > > -	return 0;
> > > +	if (nid < 0)
> > > +		nid = 0;
> > > +
> > > +	if (system_state != SYSTEM_BOOTING)
> > > +		spin_unlock(&early_pfn_lock);
> > > +
> > > +	return nid;
> > >  }
> > 
> > Why the conditional locking?
> 
> Unnecessary during boot when it's inherently serialised. The point of
> the deferred initialisation was to boot as quickly as possible.

Sure, but does it make a measurable difference?

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
@ 2015-07-17 13:29         ` Peter Zijlstra
  0 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:29 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 02:17:29PM +0100, Mel Gorman wrote:
> On Fri, Jul 17, 2015 at 03:12:32PM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
> > >  int __meminit early_pfn_to_nid(unsigned long pfn)
> > >  {
> > > +	static DEFINE_SPINLOCK(early_pfn_lock);
> > >  	int nid;
> > >  
> > > -	/* The system will behave unpredictably otherwise */
> > > -	BUG_ON(system_state != SYSTEM_BOOTING);
> > > +	/* Avoid locking overhead during boot but hotplug must lock */
> > > +	if (system_state != SYSTEM_BOOTING)
> > > +		spin_lock(&early_pfn_lock);
> > >  
> > >  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> > > -	if (nid >= 0)
> > > -		return nid;
> > > -	/* just returns 0 */
> > > -	return 0;
> > > +	if (nid < 0)
> > > +		nid = 0;
> > > +
> > > +	if (system_state != SYSTEM_BOOTING)
> > > +		spin_unlock(&early_pfn_lock);
> > > +
> > > +	return nid;
> > >  }
> > 
> > Why the conditional locking?
> 
> Unnecessary during boot when it's inherently serialised. The point of
> the deferred initialisation was to boot as quickly as possible.

Sure, but does it make a measurable difference?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
  2015-07-17 13:29         ` Peter Zijlstra
@ 2015-07-17 13:39           ` Mel Gorman
  -1 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 13:39 UTC (permalink / raw
  To: Peter Zijlstra
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 03:29:22PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 02:17:29PM +0100, Mel Gorman wrote:
> > On Fri, Jul 17, 2015 at 03:12:32PM +0200, Peter Zijlstra wrote:
> > > On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
> > > >  int __meminit early_pfn_to_nid(unsigned long pfn)
> > > >  {
> > > > +	static DEFINE_SPINLOCK(early_pfn_lock);
> > > >  	int nid;
> > > >  
> > > > -	/* The system will behave unpredictably otherwise */
> > > > -	BUG_ON(system_state != SYSTEM_BOOTING);
> > > > +	/* Avoid locking overhead during boot but hotplug must lock */
> > > > +	if (system_state != SYSTEM_BOOTING)
> > > > +		spin_lock(&early_pfn_lock);
> > > >  
> > > >  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> > > > -	if (nid >= 0)
> > > > -		return nid;
> > > > -	/* just returns 0 */
> > > > -	return 0;
> > > > +	if (nid < 0)
> > > > +		nid = 0;
> > > > +
> > > > +	if (system_state != SYSTEM_BOOTING)
> > > > +		spin_unlock(&early_pfn_lock);
> > > > +
> > > > +	return nid;
> > > >  }
> > > 
> > > Why the conditional locking?
> > 
> > Unnecessary during boot when it's inherently serialised. The point of
> > the deferred initialisation was to boot as quickly as possible.
> 
> Sure, but does it make a measurable difference?

I'm don't know and no longer have access to the necessary machine to test
any more. You make a reasonable point and I would be surprised if it was
noticable. On the other hand, conditional locking is evil and the patch
reflected my thinking at the time "we don't need locks during boot". It's
the type of thinking that should be backed with figures if it was to be
used at all so lets go with;

---8<---
mm, meminit: Allow early_pfn_to_nid to be used during runtime v2

early_pfn_to_nid historically was inherently not SMP safe but only
used during boot which is inherently single threaded or during hotplug
which is protected by a giant mutex. With deferred memory initialisation
there was a thread-safe version introduced and the early_pfn_to_nid
would trigger a BUG_ON if used unsafely. Memory hotplug hit that check.
This patch makes early_pfn_to_nid introduces a lock to make it safe to
use during hotplug.

Reported-and-tested-by: Alex Ng <alexng@microsoft.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94e2599830c2..93316f3bcecb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -982,21 +982,21 @@ static void __init __free_pages_boot_core(struct page *page,
 
 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
 	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
 
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
+	static DEFINE_SPINLOCK(early_pfn_lock);
 	int nid;
 
-	/* The system will behave unpredictably otherwise */
-	BUG_ON(system_state != SYSTEM_BOOTING);
-
+	spin_lock(&early_pfn_lock);
 	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-	if (nid >= 0)
-		return nid;
-	/* just returns 0 */
-	return 0;
+	if (nid < 0)
+		nid = 0;
+	spin_unlock(&early_pfn_lock);
+
+	return nid;
 }
 #endif
 

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
@ 2015-07-17 13:39           ` Mel Gorman
  0 siblings, 0 replies; 20+ messages in thread
From: Mel Gorman @ 2015-07-17 13:39 UTC (permalink / raw
  To: Peter Zijlstra
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 03:29:22PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 17, 2015 at 02:17:29PM +0100, Mel Gorman wrote:
> > On Fri, Jul 17, 2015 at 03:12:32PM +0200, Peter Zijlstra wrote:
> > > On Fri, Jul 17, 2015 at 01:22:04PM +0100, Mel Gorman wrote:
> > > >  int __meminit early_pfn_to_nid(unsigned long pfn)
> > > >  {
> > > > +	static DEFINE_SPINLOCK(early_pfn_lock);
> > > >  	int nid;
> > > >  
> > > > -	/* The system will behave unpredictably otherwise */
> > > > -	BUG_ON(system_state != SYSTEM_BOOTING);
> > > > +	/* Avoid locking overhead during boot but hotplug must lock */
> > > > +	if (system_state != SYSTEM_BOOTING)
> > > > +		spin_lock(&early_pfn_lock);
> > > >  
> > > >  	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
> > > > -	if (nid >= 0)
> > > > -		return nid;
> > > > -	/* just returns 0 */
> > > > -	return 0;
> > > > +	if (nid < 0)
> > > > +		nid = 0;
> > > > +
> > > > +	if (system_state != SYSTEM_BOOTING)
> > > > +		spin_unlock(&early_pfn_lock);
> > > > +
> > > > +	return nid;
> > > >  }
> > > 
> > > Why the conditional locking?
> > 
> > Unnecessary during boot when it's inherently serialised. The point of
> > the deferred initialisation was to boot as quickly as possible.
> 
> Sure, but does it make a measurable difference?

I'm don't know and no longer have access to the necessary machine to test
any more. You make a reasonable point and I would be surprised if it was
noticable. On the other hand, conditional locking is evil and the patch
reflected my thinking at the time "we don't need locks during boot". It's
the type of thinking that should be backed with figures if it was to be
used at all so lets go with;

---8<---
mm, meminit: Allow early_pfn_to_nid to be used during runtime v2

early_pfn_to_nid historically was inherently not SMP safe but only
used during boot which is inherently single threaded or during hotplug
which is protected by a giant mutex. With deferred memory initialisation
there was a thread-safe version introduced and the early_pfn_to_nid
would trigger a BUG_ON if used unsafely. Memory hotplug hit that check.
This patch makes early_pfn_to_nid introduces a lock to make it safe to
use during hotplug.

Reported-and-tested-by: Alex Ng <alexng@microsoft.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 mm/page_alloc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94e2599830c2..93316f3bcecb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -982,21 +982,21 @@ static void __init __free_pages_boot_core(struct page *page,
 
 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
 	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
 
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
+	static DEFINE_SPINLOCK(early_pfn_lock);
 	int nid;
 
-	/* The system will behave unpredictably otherwise */
-	BUG_ON(system_state != SYSTEM_BOOTING);
-
+	spin_lock(&early_pfn_lock);
 	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-	if (nid >= 0)
-		return nid;
-	/* just returns 0 */
-	return 0;
+	if (nid < 0)
+		nid = 0;
+	spin_unlock(&early_pfn_lock);
+
+	return nid;
 }
 #endif
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
  2015-07-17 13:39           ` Mel Gorman
@ 2015-07-17 13:50             ` Peter Zijlstra
  -1 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:50 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 02:39:13PM +0100, Mel Gorman wrote:

> I'm don't know and no longer have access to the necessary machine to test
> any more. You make a reasonable point and I would be surprised if it was
> noticable. On the other hand, conditional locking is evil and the patch
> reflected my thinking at the time "we don't need locks during boot". It's
> the type of thinking that should be backed with figures if it was to be
> used at all so lets go with;

Last time I tested it, an uncontended spinlock (cache hot) ran around 20
cycles, the unlock is a regular store (x86) and in single digit cycles.
I doubt modern hardware makes it go slower.

> ---8<---
> mm, meminit: Allow early_pfn_to_nid to be used during runtime v2
> 
> early_pfn_to_nid historically was inherently not SMP safe but only
> used during boot which is inherently single threaded or during hotplug
> which is protected by a giant mutex. With deferred memory initialisation
> there was a thread-safe version introduced and the early_pfn_to_nid
> would trigger a BUG_ON if used unsafely. Memory hotplug hit that check.
> This patch makes early_pfn_to_nid introduces a lock to make it safe to
> use during hotplug.
> 
> Reported-and-tested-by: Alex Ng <alexng@microsoft.com>
> Signed-off-by: Mel Gorman <mgorman@suse.de>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime
@ 2015-07-17 13:50             ` Peter Zijlstra
  0 siblings, 0 replies; 20+ messages in thread
From: Peter Zijlstra @ 2015-07-17 13:50 UTC (permalink / raw
  To: Mel Gorman
  Cc: Andrew Morton, Nicolai Stange, Dave Hansen, Alex Ng, Fengguang Wu,
	Linux-MM, LKML

On Fri, Jul 17, 2015 at 02:39:13PM +0100, Mel Gorman wrote:

> I'm don't know and no longer have access to the necessary machine to test
> any more. You make a reasonable point and I would be surprised if it was
> noticable. On the other hand, conditional locking is evil and the patch
> reflected my thinking at the time "we don't need locks during boot". It's
> the type of thinking that should be backed with figures if it was to be
> used at all so lets go with;

Last time I tested it, an uncontended spinlock (cache hot) ran around 20
cycles, the unlock is a regular store (x86) and in single digit cycles.
I doubt modern hardware makes it go slower.

> ---8<---
> mm, meminit: Allow early_pfn_to_nid to be used during runtime v2
> 
> early_pfn_to_nid historically was inherently not SMP safe but only
> used during boot which is inherently single threaded or during hotplug
> which is protected by a giant mutex. With deferred memory initialisation
> there was a thread-safe version introduced and the early_pfn_to_nid
> would trigger a BUG_ON if used unsafely. Memory hotplug hit that check.
> This patch makes early_pfn_to_nid introduces a lock to make it safe to
> use during hotplug.
> 
> Reported-and-tested-by: Alex Ng <alexng@microsoft.com>
> Signed-off-by: Mel Gorman <mgorman@suse.de>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2015-07-17 13:50 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-07-17 12:22 [PATCH 0/3] Deferred memory initialisation fixes Mel Gorman
2015-07-17 12:22 ` Mel Gorman
2015-07-17 12:22 ` [PATCH 1/3] mm, meminit: replace rwsem with completion Mel Gorman
2015-07-17 12:22   ` Mel Gorman
2015-07-17 13:12   ` Peter Zijlstra
2015-07-17 13:12     ` Peter Zijlstra
2015-07-17 12:22 ` [PATCH 2/3] fs, file table: Reinit files_stat.max_files after deferred memory initialisation Mel Gorman
2015-07-17 12:22   ` Mel Gorman
2015-07-17 12:22 ` [PATCH 3/3] mm, meminit: Allow early_pfn_to_nid to be used during runtime Mel Gorman
2015-07-17 12:22   ` Mel Gorman
2015-07-17 13:12   ` Peter Zijlstra
2015-07-17 13:12     ` Peter Zijlstra
2015-07-17 13:17     ` Mel Gorman
2015-07-17 13:17       ` Mel Gorman
2015-07-17 13:29       ` Peter Zijlstra
2015-07-17 13:29         ` Peter Zijlstra
2015-07-17 13:39         ` Mel Gorman
2015-07-17 13:39           ` Mel Gorman
2015-07-17 13:50           ` Peter Zijlstra
2015-07-17 13:50             ` Peter Zijlstra

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.