* [PATCH 01/11] lib/dlock-list: Distributed and lock-protected lists
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 2:23 ` Al Viro
2023-12-06 6:05 ` [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants Dave Chinner
` (10 subsequent siblings)
11 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Waiman Long <longman@redhat.com>
Linked lists are used everywhere in the Linux kernel. However, if many
threads try to add or delete entries on the same linked list, it can
become a performance bottleneck.
This patch introduces a new set of list APIs that provide distributed
lists (one per CPU), each of which is protected by its own spinlock.
To the callers, however, the set of lists acts like a single
consolidated list. This allows list entry insertion and deletion
operations to happen in parallel instead of being serialized on a
single global list and lock.
List entry insertion is strictly per-CPU. List deletion, however, can
happen on a CPU other than the one that did the insertion, so we still
need a lock to protect each list. Because of that, there may still be
a small amount of contention when deletions are performed.
A new header file include/linux/dlock-list.h will be added with the
associated dlock_list_head and dlock_list_node structures. The following
functions are provided to manage the per-CPU lists:
1. int alloc_dlock_list_heads(struct dlock_list_heads *dlist)
2. void free_dlock_list_heads(struct dlock_list_heads *dlist)
3. void dlock_lists_add(struct dlock_list_node *node,
   struct dlock_list_heads *dlist)
4. void dlock_lists_del(struct dlock_list_node *node)
Iteration over all the list entries within a dlock list array
is done by calling either the dlist_for_each_entry() or
dlist_for_each_entry_safe() macros. They correspond to the
list_for_each_entry() and list_for_each_entry_safe() macros
respectively. The iteration state is kept in a dlock_list_iter
structure that is passed to the iteration macros.
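As a rough usage sketch (the struct foo container, its id field and the
pr_info() reporting are made up for illustration; only the dlock-list
calls themselves come from this patch):

#include <linux/dlock-list.h>
#include <linux/printk.h>

struct foo {
	int			id;
	struct dlock_list_node	node;	/* links this foo into foo_lists */
};

static struct dlock_list_heads foo_lists;

static int foo_lists_setup(void)
{
	/* Allocates one sublist (and spinlock) per possible CPU. */
	return alloc_dlock_list_heads(&foo_lists);
}

static void foo_insert(struct foo *f)
{
	init_dlock_list_node(&f->node);
	/* Adds to the sublist of the CPU doing the insertion. */
	dlock_lists_add(&f->node, &foo_lists);
}

static void foo_remove(struct foo *f)
{
	/* May run on any CPU; the node remembers which sublist/lock it uses. */
	dlock_lists_del(&f->node);
}

static void foo_dump(void)
{
	struct foo *f;
	DEFINE_DLOCK_LIST_ITER(iter, &foo_lists);

	/* Walks every sublist, holding only one sublist lock at a time. */
	dlist_for_each_entry(f, &iter, node)
		pr_info("foo %d\n", f->id);
}
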
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
include/linux/dlock-list.h | 242 +++++++++++++++++++++++++++++++++++++
lib/Makefile | 2 +-
lib/dlock-list.c | 234 +++++++++++++++++++++++++++++++++++
3 files changed, 477 insertions(+), 1 deletion(-)
create mode 100644 include/linux/dlock-list.h
create mode 100644 lib/dlock-list.c
diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
new file mode 100644
index 000000000000..327cb9edc7e3
--- /dev/null
+++ b/include/linux/dlock-list.h
@@ -0,0 +1,242 @@
+/*
+ * Distributed and locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2017-2018 Red Hat, Inc.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+#ifndef __LINUX_DLOCK_LIST_H
+#define __LINUX_DLOCK_LIST_H
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+
+/*
+ * include/linux/dlock-list.h
+ *
+ * The dlock_list_head structure contains the spinlock. It is cacheline
+ * aligned to reduce contention among different CPUs. The other
+ * dlock_list_node structures contains a pointer to the head entry instead.
+ */
+struct dlock_list_head {
+ struct list_head list;
+ spinlock_t lock;
+} ____cacheline_aligned_in_smp;
+
+struct dlock_list_heads {
+ struct dlock_list_head *heads;
+};
+
+/*
+ * dlock list node data structure
+ */
+struct dlock_list_node {
+ struct list_head list;
+ struct dlock_list_head *head;
+};
+
+/*
+ * dlock list iteration state
+ *
+ * This is an opaque data structure that may change. Users of this structure
+ * should not access the structure members directly other than using the
+ * helper functions and macros provided in this header file.
+ */
+struct dlock_list_iter {
+ int index;
+ struct dlock_list_head *head, *entry;
+};
+
+#define DLOCK_LIST_ITER_INIT(dlist) \
+ { \
+ .index = -1, \
+ .head = (dlist)->heads, \
+ }
+
+#define DEFINE_DLOCK_LIST_ITER(s, heads) \
+ struct dlock_list_iter s = DLOCK_LIST_ITER_INIT(heads)
+
+static inline void init_dlock_list_iter(struct dlock_list_iter *iter,
+ struct dlock_list_heads *heads)
+{
+ *iter = (struct dlock_list_iter)DLOCK_LIST_ITER_INIT(heads);
+}
+
+#define DLOCK_LIST_NODE_INIT(name) \
+ { \
+ .list = LIST_HEAD_INIT(name) \
+ }
+
+static inline void init_dlock_list_node(struct dlock_list_node *node)
+{
+ *node = (struct dlock_list_node)DLOCK_LIST_NODE_INIT(node->list);
+}
+
+/**
+ * dlock_list_unlock - unlock the spinlock that protects the current list
+ * @iter: Pointer to the dlock list iterator structure
+ */
+static inline void dlock_list_unlock(struct dlock_list_iter *iter)
+{
+ spin_unlock(&iter->entry->lock);
+}
+
+/**
+ * dlock_list_relock - lock the spinlock that protects the current list
+ * @iter: Pointer to the dlock list iterator structure
+ */
+static inline void dlock_list_relock(struct dlock_list_iter *iter)
+{
+ spin_lock(&iter->entry->lock);
+}
+
+/*
+ * Allocation and freeing of dlock list
+ */
+extern int __alloc_dlock_list_heads(struct dlock_list_heads *dlist,
+ struct lock_class_key *key);
+extern void free_dlock_list_heads(struct dlock_list_heads *dlist);
+
+/**
+ * alloc_dlock_list_head - Initialize and allocate the list of head entries.
+ * @dlist : Pointer to the dlock_list_heads structure to be initialized
+ * Return : 0 if successful, -ENOMEM if memory allocation error
+ */
+#define alloc_dlock_list_heads(dlist) \
+({ \
+ static struct lock_class_key _key; \
+ __alloc_dlock_list_heads(dlist, &_key); \
+})
+
+/*
+ * Check if a dlock list is empty or not.
+ */
+extern bool dlock_lists_empty(struct dlock_list_heads *dlist);
+
+/*
+ * The dlock list addition and deletion functions here are not irq-safe.
+ * Special irq-safe variants will have to be added if we need them.
+ */
+extern void dlock_lists_add(struct dlock_list_node *node,
+ struct dlock_list_heads *dlist);
+extern void dlock_lists_del(struct dlock_list_node *node);
+
+/*
+ * Find the first entry of the next available list.
+ */
+extern struct dlock_list_node *
+__dlock_list_next_list(struct dlock_list_iter *iter);
+
+/**
+ * __dlock_list_next_entry - Iterate to the next entry of the dlock list
+ * @curr : Pointer to the current dlock_list_node structure
+ * @iter : Pointer to the dlock list iterator structure
+ * Return: Pointer to the next entry or NULL if all the entries are iterated
+ *
+ * The iterator has to be properly initialized before calling this function.
+ */
+static inline struct dlock_list_node *
+__dlock_list_next_entry(struct dlock_list_node *curr,
+ struct dlock_list_iter *iter)
+{
+ /*
+ * Find next entry
+ */
+ if (curr)
+ curr = list_next_entry(curr, list);
+
+ if (!curr || (&curr->list == &iter->entry->list)) {
+ /*
+ * The current list has been exhausted, try the next available
+ * list.
+ */
+ curr = __dlock_list_next_list(iter);
+ }
+
+ return curr; /* Continue the iteration */
+}
+
+/**
+ * _dlock_list_next_list_entry - get first element from next list in iterator
+ * @iter : The dlock list iterator.
+ * @pos : A variable of the struct that is embedded in.
+ * @member: The name of the dlock_list_node within the struct.
+ * Return : Pointer to first entry or NULL if all the lists are iterated.
+ */
+#define _dlock_list_next_list_entry(iter, pos, member) \
+ ({ \
+ struct dlock_list_node *_n; \
+ _n = __dlock_list_next_entry(NULL, iter); \
+ _n ? list_entry(_n, typeof(*pos), member) : NULL; \
+ })
+
+/**
+ * _dlock_list_next_entry - iterate to the next entry of the list
+ * @pos : The type * to cursor
+ * @iter : The dlock list iterator.
+ * @member: The name of the dlock_list_node within the struct.
+ * Return : Pointer to the next entry or NULL if all the entries are iterated.
+ *
+ * Note that pos can't be NULL.
+ */
+#define _dlock_list_next_entry(pos, iter, member) \
+ ({ \
+ struct dlock_list_node *_n; \
+ _n = __dlock_list_next_entry(&(pos)->member, iter); \
+ _n ? list_entry(_n, typeof(*(pos)), member) : NULL; \
+ })
+
+/**
+ * dlist_for_each_entry - iterate over the dlock list
+ * @pos : Type * to use as a loop cursor
+ * @iter : The dlock list iterator
+ * @member: The name of the dlock_list_node within the struct
+ *
+ * This iteration macro isn't safe with respect to list entry removal, but
+ * it can correctly iterate newly added entries right after the current one.
+ * This iteration function is designed to be used in a while loop.
+ */
+#define dlist_for_each_entry(pos, iter, member) \
+ for (pos = _dlock_list_next_list_entry(iter, pos, member); \
+ pos != NULL; \
+ pos = _dlock_list_next_entry(pos, iter, member))
+
+/**
+ * dlist_for_each_entry_safe - iterate over the dlock list & safe over removal
+ * @pos : Type * to use as a loop cursor
+ * @n : Another type * to use as temporary storage
+ * @iter : The dlock list iterator
+ * @member: The name of the dlock_list_node within the struct
+ *
+ * This iteration macro is safe with respect to list entry removal.
+ * However, it cannot correctly iterate newly added entries right after the
+ * current one.
+ *
+ * The call to __dlock_list_next_list() is deferred until the next entry
+ * is being iterated to avoid use-after-unlock problem.
+ */
+#define dlist_for_each_entry_safe(pos, n, iter, member) \
+ for (pos = NULL; \
+ ({ \
+ if (!pos || \
+ (&(pos)->member.list == &(iter)->entry->list)) \
+ pos = _dlock_list_next_list_entry(iter, pos, \
+ member); \
+ if (pos) \
+ n = list_next_entry(pos, member.list); \
+ pos; \
+ }); \
+ pos = n)
+
+#endif /* __LINUX_DLOCK_LIST_H */
diff --git a/lib/Makefile b/lib/Makefile
index 6b09731d8e61..73d84b569f1e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -48,7 +48,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
bsearch.o find_bit.o llist.o lwq.o memweight.o kfifo.o \
percpu-refcount.o rhashtable.o base64.o \
once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
- generic-radix-tree.o bitmap-str.o
+ generic-radix-tree.o bitmap-str.o dlock-list.o
obj-$(CONFIG_STRING_SELFTEST) += test_string.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
diff --git a/lib/dlock-list.c b/lib/dlock-list.c
new file mode 100644
index 000000000000..f64ea4cc5e79
--- /dev/null
+++ b/lib/dlock-list.c
@@ -0,0 +1,234 @@
+/*
+ * Distributed and locked list
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2016 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2017-2018 Red Hat, Inc.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+#include <linux/dlock-list.h>
+#include <linux/lockdep.h>
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+
+/*
+ * The distributed and locked list is a distributed set of lists each of
+ * which is protected by its own spinlock, but acts like a single
+ * consolidated list to the callers. For scaling purpose, the number of
+ * lists used is equal to the number of possible CPUs in the system to
+ * minimize contention.
+ *
+ * However, it is possible that individual CPU numbers may be equal to
+ * or greater than the number of possible CPUs when there are holes in
+ * the CPU number list. As a result, we need to map the CPU number to a
+ * list index.
+ */
+static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx);
+
+/*
+ * Initialize cpu2idx mapping table
+ *
+ * It is possible that a dlock-list can be allocated before the cpu2idx is
+ * initialized. In this case, all the cpus are mapped to the first entry
+ * before initialization.
+ *
+ */
+static int __init cpu2idx_init(void)
+{
+ int idx, cpu;
+
+ idx = 0;
+ for_each_possible_cpu(cpu)
+ per_cpu(cpu2idx, cpu) = idx++;
+ return 0;
+}
+postcore_initcall(cpu2idx_init);
+
+/**
+ * __alloc_dlock_list_heads - Initialize and allocate the list of head entries
+ * @dlist: Pointer to the dlock_list_heads structure to be initialized
+ * @key : The lock class key to be used for lockdep
+ * Return: 0 if successful, -ENOMEM if memory allocation error
+ *
+ * This function does not allocate the dlock_list_heads structure itself. The
+ * callers will have to do their own memory allocation, if necessary. However,
+ * this allows embedding the dlock_list_heads structure directly into other
+ * structures.
+ *
+ * Dynamically allocated locks need to have their own special lock class
+ * to avoid lockdep warning.
+ */
+int __alloc_dlock_list_heads(struct dlock_list_heads *dlist,
+ struct lock_class_key *key)
+{
+ int idx;
+
+ dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head),
+ GFP_KERNEL);
+
+ if (!dlist->heads)
+ return -ENOMEM;
+
+ for (idx = 0; idx < nr_cpu_ids; idx++) {
+ struct dlock_list_head *head = &dlist->heads[idx];
+
+ INIT_LIST_HEAD(&head->list);
+ head->lock = __SPIN_LOCK_UNLOCKED(&head->lock);
+ lockdep_set_class(&head->lock, key);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(__alloc_dlock_list_heads);
+
+/**
+ * free_dlock_list_heads - Free all the heads entries of the dlock list
+ * @dlist: Pointer of the dlock_list_heads structure to be freed
+ *
+ * This function doesn't free the dlock_list_heads structure itself. So
+ * the caller will have to do it, if necessary.
+ */
+void free_dlock_list_heads(struct dlock_list_heads *dlist)
+{
+ kfree(dlist->heads);
+ dlist->heads = NULL;
+}
+EXPORT_SYMBOL(free_dlock_list_heads);
+
+/**
+ * dlock_lists_empty - Check if all the dlock lists are empty
+ * @dlist: Pointer to the dlock_list_heads structure
+ * Return: true if list is empty, false otherwise.
+ *
+ * This can be a pretty expensive function call. If this function is required
+ * in a performance critical path, we may have to maintain a global count
+ * of the list entries in the global dlock_list_heads structure instead.
+ */
+bool dlock_lists_empty(struct dlock_list_heads *dlist)
+{
+ int idx;
+
+ for (idx = 0; idx < nr_cpu_ids; idx++)
+ if (!list_empty(&dlist->heads[idx].list))
+ return false;
+ return true;
+}
+EXPORT_SYMBOL(dlock_lists_empty);
+
+/**
+ * dlock_lists_add - Adds a node to the given dlock list
+ * @node : The node to be added
+ * @dlist: The dlock list where the node is to be added
+ *
+ * List selection is based on the CPU being used when the dlock_list_add()
+ * function is called. However, deletion may be done by a different CPU.
+ */
+void dlock_lists_add(struct dlock_list_node *node,
+ struct dlock_list_heads *dlist)
+{
+ struct dlock_list_head *head = &dlist->heads[this_cpu_read(cpu2idx)];
+
+ /*
+ * There is no need to disable preemption
+ */
+ spin_lock(&head->lock);
+ WRITE_ONCE(node->head, head);
+ list_add(&node->list, &head->list);
+ spin_unlock(&head->lock);
+}
+EXPORT_SYMBOL(dlock_lists_add);
+
+/**
+ * dlock_lists_del - Delete a node from a dlock list
+ * @node : The node to be deleted
+ *
+ * We need to check the lock pointer again after taking the lock to guard
+ * against concurrent deletion of the same node. If the lock pointer changes
+ * (becomes NULL or to a different one), we assume that the deletion was done
+ * elsewhere. A warning will be printed if this happens as it is likely to be
+ * a bug.
+ */
+void dlock_lists_del(struct dlock_list_node *node)
+{
+ struct dlock_list_head *head;
+ bool retry;
+
+ do {
+ head = READ_ONCE(node->head);
+ if (WARN_ONCE(!head, "%s: node 0x%lx has no associated head\n",
+ __func__, (unsigned long)node))
+ return;
+
+ spin_lock(&head->lock);
+ if (likely(head == READ_ONCE(node->head))) {
+ list_del_init(&node->list);
+ WRITE_ONCE(node->head, NULL);
+ retry = false;
+ } else {
+ /*
+ * The lock has somehow changed. Retry again if it is
+ * not NULL. Otherwise, just ignore the delete
+ * operation.
+ */
+ retry = (READ_ONCE(node->head) != NULL);
+ }
+ spin_unlock(&head->lock);
+ } while (retry);
+}
+EXPORT_SYMBOL(dlock_lists_del);
+
+/**
+ * __dlock_list_next_list: Find the first entry of the next available list
+ * @dlist: Pointer to the dlock_list_heads structure
+ * @iter : Pointer to the dlock list iterator structure
+ * Return: true if the entry is found, false if all the lists exhausted
+ *
+ * The information about the next available list will be put into the iterator.
+ */
+struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
+{
+ struct dlock_list_node *next;
+ struct dlock_list_head *head;
+
+restart:
+ if (iter->entry) {
+ spin_unlock(&iter->entry->lock);
+ iter->entry = NULL;
+ }
+
+next_list:
+ /*
+ * Try next list
+ */
+ if (++iter->index >= nr_cpu_ids)
+ return NULL; /* All the entries iterated */
+
+ if (list_empty(&iter->head[iter->index].list))
+ goto next_list;
+
+ head = iter->entry = &iter->head[iter->index];
+ spin_lock(&head->lock);
+ /*
+ * There is a slight chance that the list may become empty just
+ * before the lock is acquired. So an additional check is
+ * needed to make sure that a valid node will be returned.
+ */
+ if (list_empty(&head->list))
+ goto restart;
+
+ next = list_entry(head->list.next, struct dlock_list_node,
+ list);
+ WARN_ON_ONCE(next->head != head);
+
+ return next;
+}
+EXPORT_SYMBOL(__dlock_list_next_list);
--
2.42.0
* Re: [PATCH 01/11] lib/dlock-list: Distributed and lock-protected lists
2023-12-06 6:05 ` [PATCH 01/11] lib/dlock-list: Distributed and lock-protected lists Dave Chinner
@ 2023-12-07 2:23 ` Al Viro
0 siblings, 0 replies; 34+ messages in thread
From: Al Viro @ 2023-12-07 2:23 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:30PM +1100, Dave Chinner wrote:
> +static inline struct dlock_list_node *
> +__dlock_list_next_entry(struct dlock_list_node *curr,
> + struct dlock_list_iter *iter)
> +{
> + /*
> + * Find next entry
> + */
> + if (curr)
> + curr = list_next_entry(curr, list);
> +
> + if (!curr || (&curr->list == &iter->entry->list)) {
Hmm... hlist, perhaps? I mean, that way the thing becomes

	if (curr)
		curr = hlist_entry_safe(curr->node.next,
					struct dlock_list_node, node);
	if (!curr)
		curr = __dlock_list_next_list(iter);
	return curr;

BTW, does anybody have objections against

#define hlist_first_entry(head, type, member) \
	hlist_entry_safe((head)->first, type, member)

#define hlist_next_entry(pos, member) \
	hlist_entry_safe((pos)->member.next, typeof(*pos), member)

added in list.h?
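
For illustration only (assuming dlock_list_node switched its embedded
list_head to an hlist_node named "node", which is not part of the posted
patch), the step helper written with the proposed macro would collapse
to:

static inline struct dlock_list_node *
__dlock_list_next_entry(struct dlock_list_node *curr,
			struct dlock_list_iter *iter)
{
	if (curr)	/* NULL result means end of the current sublist */
		curr = hlist_next_entry(curr, node);
	if (!curr)
		curr = __dlock_list_next_list(iter);
	return curr;
}
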
> +static int __init cpu2idx_init(void)
> +{
> + int idx, cpu;
> +
> + idx = 0;
> + for_each_possible_cpu(cpu)
> + per_cpu(cpu2idx, cpu) = idx++;
> + return 0;
> +}
> +postcore_initcall(cpu2idx_init);
Is it early enough? Feels like that ought to be done from smp_init() or
right after it...
> +/**
> + * dlock_lists_empty - Check if all the dlock lists are empty
> + * @dlist: Pointer to the dlock_list_heads structure
> + * Return: true if list is empty, false otherwise.
> + *
> + * This can be a pretty expensive function call. If this function is required
> + * in a performance critical path, we may have to maintain a global count
> + * of the list entries in the global dlock_list_heads structure instead.
> + */
> +bool dlock_lists_empty(struct dlock_list_heads *dlist)
> +{
> + int idx;
> +
> + for (idx = 0; idx < nr_cpu_ids; idx++)
> + if (!list_empty(&dlist->heads[idx].list))
> + return false;
> + return true;
> +}
Umm... How would one use it, anyway? You'd need to stop all insertions
first, wouldn't you?
> + */
> +struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
> +{
> + struct dlock_list_node *next;
> + struct dlock_list_head *head;
> +
> +restart:
> + if (iter->entry) {
> + spin_unlock(&iter->entry->lock);
> + iter->entry = NULL;
> + }
> +
> +next_list:
> + /*
> + * Try next list
> + */
> + if (++iter->index >= nr_cpu_ids)
> + return NULL; /* All the entries iterated */
> +
> + if (list_empty(&iter->head[iter->index].list))
> + goto next_list;
> +
> + head = iter->entry = &iter->head[iter->index];
> + spin_lock(&head->lock);
> + /*
> + * There is a slight chance that the list may become empty just
> + * before the lock is acquired. So an additional check is
> + * needed to make sure that a valid node will be returned.
> + */
> + if (list_empty(&head->list))
> + goto restart;
> +
> + next = list_entry(head->list.next, struct dlock_list_node,
> + list);
> + WARN_ON_ONCE(next->head != head);
> +
> + return next;
> +}
Perhaps something like

	if (iter->entry) {
		spin_unlock(&iter->entry->lock);
		iter->entry = NULL;
	}
	while (++iter->index < nr_cpu_ids) {
		struct dlock_list_head *head = &iter->head[iter->index];

		if (list_empty(&head->list))
			continue;

		spin_lock(&head->lock);
		// recheck under lock
		if (unlikely(list_empty(&head->list))) {
			spin_unlock(&head->lock);
			continue;
		}
		iter->entry = head;
		next = list_first_entry(&head->list,
					struct dlock_list_node, list);
		WARN_ON_ONCE(next->head != head);
		return next;
	}
	return NULL;
* [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
2023-12-06 6:05 ` [PATCH 01/11] lib/dlock-list: Distributed and lock-protected lists Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 2:26 ` Al Viro
2023-12-07 4:18 ` Kent Overstreet
2023-12-06 6:05 ` [PATCH 03/11] vfs: Use dlock list for superblock's inode list Dave Chinner
` (9 subsequent siblings)
11 siblings, 2 replies; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Jan Kara <jack@suse.cz>
evict_inodes() and invalidate_inodes() use list_for_each_entry_safe()
to iterate sb->s_inodes list. However, since we use i_lru list entry for
our local temporary list of inodes to destroy, the inode is guaranteed
to stay in sb->s_inodes list while we hold sb->s_inode_list_lock. So
there is no real need for safe iteration variant and we can use
list_for_each_entry() just fine.
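As a stripped-down sketch of that pattern (modelled on the existing
fs/inode.c code; the unusual-state checks and the need_resched()
back-off are elided here), the loop only queues each inode on the local
dispose list via i_lru and never unlinks it from i_sb_list while the
list lock is held, so the iteration cursor stays valid:

static void evict_inodes_sketch(struct super_block *sb)
{
	struct inode *inode;
	LIST_HEAD(dispose);

	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		if (atomic_read(&inode->i_count))
			continue;

		spin_lock(&inode->i_lock);
		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);

		/* Queued for teardown via i_lru; i_sb_list is untouched. */
		list_add(&inode->i_lru, &dispose);
	}
	spin_unlock(&sb->s_inode_list_lock);

	dispose_list(&dispose);
}
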
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Waiman Long <longman@redhat.com>
---
fs/inode.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index f238d987dec9..17c50a75514f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -712,12 +712,12 @@ static void dispose_list(struct list_head *head)
*/
void evict_inodes(struct super_block *sb)
{
- struct inode *inode, *next;
+ struct inode *inode;
LIST_HEAD(dispose);
again:
spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
@@ -758,12 +758,12 @@ EXPORT_SYMBOL_GPL(evict_inodes);
*/
void invalidate_inodes(struct super_block *sb)
{
- struct inode *inode, *next;
+ struct inode *inode;
LIST_HEAD(dispose);
again:
spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
--
2.42.0
* Re: [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants
2023-12-06 6:05 ` [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants Dave Chinner
@ 2023-12-07 2:26 ` Al Viro
2023-12-07 4:18 ` Kent Overstreet
1 sibling, 0 replies; 34+ messages in thread
From: Al Viro @ 2023-12-07 2:26 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:31PM +1100, Dave Chinner wrote:
> From: Jan Kara <jack@suse.cz>
>
> evict_inodes() and invalidate_inodes() use list_for_each_entry_safe()
> to iterate sb->s_inodes list. However, since we use i_lru list entry for
> our local temporary list of inodes to destroy, the inode is guaranteed
> to stay in sb->s_inodes list while we hold sb->s_inode_list_lock. So
> there is no real need for safe iteration variant and we can use
> list_for_each_entry() just fine.
>
> Signed-off-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Waiman Long <longman@redhat.com>
ACKed-by: Al Viro <viro@zeniv.linux.org.uk>
* Re: [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants
2023-12-06 6:05 ` [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants Dave Chinner
2023-12-07 2:26 ` Al Viro
@ 2023-12-07 4:18 ` Kent Overstreet
1 sibling, 0 replies; 34+ messages in thread
From: Kent Overstreet @ 2023-12-07 4:18 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:31PM +1100, Dave Chinner wrote:
> From: Jan Kara <jack@suse.cz>
>
> evict_inodes() and invalidate_inodes() use list_for_each_entry_safe()
> to iterate sb->s_inodes list. However, since we use i_lru list entry for
> our local temporary list of inodes to destroy, the inode is guaranteed
> to stay in sb->s_inodes list while we hold sb->s_inode_list_lock. So
> there is no real need for safe iteration variant and we can use
> list_for_each_entry() just fine.
>
> Signed-off-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
* [PATCH 03/11] vfs: Use dlock list for superblock's inode list
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
2023-12-06 6:05 ` [PATCH 01/11] lib/dlock-list: Distributed and lock-protected lists Dave Chinner
2023-12-06 6:05 ` [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 2:40 ` Al Viro
2023-12-06 6:05 ` [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list Dave Chinner
` (8 subsequent siblings)
11 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Waiman Long <longman@redhat.com>
[dchinner: original commit message preserved]
When many threads are trying to add inodes to or delete inodes from
a superblock's s_inodes list, spinlock contention on the list can
become a performance bottleneck.
This patch changes the s_inodes field to become a dlock list which
is a distributed set of lists with per-list spinlocks. As a result,
the following superblock inode list (sb->s_inodes) iteration functions
in vfs are also being modified:
1. iterate_bdevs()
2. drop_pagecache_sb()
3. evict_inodes()
4. invalidate_inodes()
5. fsnotify_unmount_inodes()
6. add_dquot_ref()
7. remove_dquot_ref()
The patch was tested with an exit microbenchmark that creates a large
number of threads, attaches many inodes to them via procfs and then
exits. The runtimes of that microbenchmark with various numbers of
threads, before and after the patch, on a 4-socket Intel E7-8867 v3
system (64 cores, 128 threads) running a 4.19-rc3 based kernel were as
follows:
  # of threads    Elapsed/Sys Time      Elapsed/Sys Time     Speedup
                  Unpatched Kernel      Patched Kernel
  ------------    ----------------      ----------------     -------
      1000        59.17s/123m09.8s      18.90s/24m44.5s       3.13
      1200        73.20s/151m24.1s      27.54s/50m05.3s       2.66
      1400       102.04s/212m00.9s      36.75s/68m26.7s       2.78
      1600       131.13s/272m52.4s      50.16s/94m23.7s       2.61
[dchinner: forward port, add new inode list traversals, etc]
[dchinner: scalability results on current TOT XFS]
With a concurrent directory traversal workload of 400k inodes per
thread, scalability improves at >=16 threads on 6.7-rc4 on XFS. We
only test XFS here as it is the only filesystem that demonstrates
sufficient internal scalability for the superblock inode list to be a
scalability bottleneck.
The table contains test runtimes in seconds; perfect scalability would
be demonstrated by the runtime staying constant as the thread count
goes up.
Threads    6.4-rc7    patched
-------    -------    -------
      2     11.673     11.158
      4      9.665      9.444
      8     10.622      9.275
     16     12.148      9.508
     32     20.518     10.308
Unpatched kernel profile at 32 threads:
- 95.45% vfs_fstatat
- 95.00% vfs_statx
- 91.00% filename_lookup
- 90.90% path_lookupat
- 90.40% walk_component
- 89.05% lookup_slow
- 88.95% __lookup_slow
- 86.38% xfs_vn_lookup
- 84.05% xfs_lookup
- 78.82% xfs_iget
- 72.58% xfs_setup_inode
- 72.54% inode_sb_list_add
- 71.12% _raw_spin_lock
- 71.09% do_raw_spin_lock
- 68.85% __pv_queued_spin_lock_slowpath
Patched kernel profile at 32 threads - the biggest single point of
contention is now the dentry cache LRU via dput():
- 21.59% 0.25% [kernel] [k] dput
- 21.34% dput
- 19.93% retain_dentry
- d_lru_add
- 19.82% list_lru_add
- 14.62% _raw_spin_lock
- 14.47% do_raw_spin_lock
10.89% __pv_queued_spin_lock_slowpath
1.78% __list_add_valid_or_report
- 0.81% _raw_spin_unlock
- do_raw_spin_unlock
0.77% __raw_callee_save___pv_queued_spin_unlock
- 0.79% _raw_spin_unlock
- 0.78% do_raw_spin_unlock
0.67% __raw_callee_save___pv_queued_spin_unlock
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
block/bdev.c | 24 ++++++++----------------
fs/drop_caches.c | 9 ++++-----
fs/gfs2/ops_fstype.c | 21 +++++++++++----------
fs/inode.c | 37 ++++++++++++++++---------------------
fs/notify/fsnotify.c | 12 ++++++------
fs/quota/dquot.c | 22 ++++++----------------
fs/super.c | 13 +++++++------
include/linux/fs.h | 8 ++++----
security/landlock/fs.c | 25 ++++++-------------------
9 files changed, 68 insertions(+), 103 deletions(-)
diff --git a/block/bdev.c b/block/bdev.c
index 750aec178b6a..07135fd6fda4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -437,11 +437,11 @@ long nr_blockdev_pages(void)
{
struct inode *inode;
long ret = 0;
+ DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes);
- spin_lock(&blockdev_superblock->s_inode_list_lock);
- list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
ret += inode->i_mapping->nrpages;
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
+ }
return ret;
}
@@ -1032,9 +1032,9 @@ EXPORT_SYMBOL_GPL(bdev_mark_dead);
void sync_bdevs(bool wait)
{
struct inode *inode, *old_inode = NULL;
+ DEFINE_DLOCK_LIST_ITER(iter, &blockdev_superblock->s_inodes);
- spin_lock(&blockdev_superblock->s_inode_list_lock);
- list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
struct address_space *mapping = inode->i_mapping;
struct block_device *bdev;
@@ -1046,15 +1046,8 @@ void sync_bdevs(bool wait)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
- /*
- * We hold a reference to 'inode' so it couldn't have been
- * removed from s_inodes list while we dropped the
- * s_inode_list_lock We cannot iput the inode now as we can
- * be holding the last reference and we cannot iput it under
- * s_inode_list_lock. So we keep the reference and iput it
- * later.
- */
+ dlock_list_unlock(&iter);
+
iput(old_inode);
old_inode = inode;
bdev = I_BDEV(inode);
@@ -1075,9 +1068,8 @@ void sync_bdevs(bool wait)
}
mutex_unlock(&bdev->bd_disk->open_mutex);
- spin_lock(&blockdev_superblock->s_inode_list_lock);
+ dlock_list_relock(&iter);
}
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index b9575957a7c2..3596d0a7c0da 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -19,9 +19,9 @@ int sysctl_drop_caches;
static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
struct inode *inode, *toput_inode = NULL;
+ DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
spin_lock(&inode->i_lock);
/*
* We must skip inodes in unusual state. We may also skip
@@ -35,16 +35,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
+ dlock_list_unlock(&iter);
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
cond_resched();
- spin_lock(&sb->s_inode_list_lock);
+ dlock_list_relock(&iter);
}
- spin_unlock(&sb->s_inode_list_lock);
iput(toput_inode);
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108c5d26839..1105710482e7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1738,22 +1738,24 @@ static int gfs2_meta_init_fs_context(struct fs_context *fc)
* attempt will time out. Since inodes are evicted sequentially, this can add
* up quickly.
*
- * Function evict_inodes() tries to keep the s_inode_list_lock list locked over
- * a long time, which prevents other inodes from being evicted concurrently.
- * This precludes the cooperative behavior we are looking for. This special
- * version of evict_inodes() avoids that.
- *
* Modeled after drop_pagecache_sb().
+ *
+ * XXX(dgc): this is particularly awful. With the dlist for inodes, concurrent
+ * access to the inode list can occur and evict_inodes() will drop the per-cpu
+ * list lock if the CPU needs rescheduling. Hence if this exists just because
+ * evict_inodes() holds the s_inode_list_lock for long periods preventing
+ * concurrent inode eviction work from being done, this can probably go away
+ * entirely now.
*/
static void gfs2_evict_inodes(struct super_block *sb)
{
struct inode *inode, *toput_inode = NULL;
struct gfs2_sbd *sdp = sb->s_fs_info;
+ DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
set_bit(SDF_EVICTING, &sdp->sd_flags);
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) &&
!need_resched()) {
@@ -1762,15 +1764,14 @@ static void gfs2_evict_inodes(struct super_block *sb)
}
atomic_inc(&inode->i_count);
spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
+ dlock_list_unlock(&iter);
iput(toput_inode);
toput_inode = inode;
cond_resched();
- spin_lock(&sb->s_inode_list_lock);
+ dlock_list_relock(&iter);
}
- spin_unlock(&sb->s_inode_list_lock);
iput(toput_inode);
}
diff --git a/fs/inode.c b/fs/inode.c
index 17c50a75514f..3426691fa305 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -30,7 +30,7 @@
* inode->i_state, inode->i_hash, __iget(), inode->i_io_list
* Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
- * inode->i_sb->s_inode_list_lock protects:
+ * inode->i_sb->s_inodes->head->lock protects:
* inode->i_sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
* bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
@@ -39,7 +39,7 @@
*
* Lock ordering:
*
- * inode->i_sb->s_inode_list_lock
+ * inode->i_sb->s_inodes->head->lock
* inode->i_lock
* Inode LRU list locks
*
@@ -47,7 +47,7 @@
* inode->i_lock
*
* inode_hash_lock
- * inode->i_sb->s_inode_list_lock
+ * inode->i_sb->s_inodes->head->lock
* inode->i_lock
*
* iunique_lock
@@ -423,7 +423,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
INIT_LIST_HEAD(&inode->i_lru);
- INIT_LIST_HEAD(&inode->i_sb_list);
+ init_dlock_list_node(&inode->i_sb_list);
__address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
}
@@ -492,19 +492,14 @@ static void inode_lru_list_del(struct inode *inode)
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode->i_sb->s_inode_list_lock);
- list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ dlock_lists_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
- if (!list_empty(&inode->i_sb_list)) {
- spin_lock(&inode->i_sb->s_inode_list_lock);
- list_del_init(&inode->i_sb_list);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
- }
+ if (!list_empty(&inode->i_sb_list.list))
+ dlock_lists_del(&inode->i_sb_list);
}
static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -713,11 +708,12 @@ static void dispose_list(struct list_head *head)
void evict_inodes(struct super_block *sb)
{
struct inode *inode;
+ struct dlock_list_iter iter;
LIST_HEAD(dispose);
again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ init_dlock_list_iter(&iter, &sb->s_inodes);
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
@@ -738,13 +734,12 @@ void evict_inodes(struct super_block *sb)
* bit so we don't livelock.
*/
if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
+ dlock_list_unlock(&iter);
cond_resched();
dispose_list(&dispose);
goto again;
}
}
- spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
}
@@ -759,11 +754,12 @@ EXPORT_SYMBOL_GPL(evict_inodes);
void invalidate_inodes(struct super_block *sb)
{
struct inode *inode;
+ struct dlock_list_iter iter;
LIST_HEAD(dispose);
again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ init_dlock_list_iter(&iter, &sb->s_inodes);
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
@@ -779,13 +775,12 @@ void invalidate_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
+ dlock_list_unlock(&iter);
cond_resched();
dispose_list(&dispose);
goto again;
}
}
- spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
}
@@ -1232,7 +1227,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
* Add inode to the sb list if it's not already. It has I_NEW at this
* point, so it should be safe to test i_sb_list locklessly.
*/
- if (list_empty(&inode->i_sb_list))
+ if (list_empty(&inode->i_sb_list.list))
inode_sb_list_add(inode);
unlock:
spin_unlock(&inode_hash_lock);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7974e91ffe13..15e3769e76f5 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -33,14 +33,15 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
* @sb: superblock being unmounted.
*
* Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
+ * concurrent modifiers. We temporarily drop sb->s_inodes list lock and CAN
+ * block.
*/
static void fsnotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *iput_inode = NULL;
+ DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
/*
* We cannot __iget() an inode in state I_FREEING,
* I_WILL_FREE, or I_NEW which is fine because by that point
@@ -68,7 +69,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
+ dlock_list_unlock(&iter);
iput(iput_inode);
@@ -80,9 +81,8 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
iput_inode = inode;
cond_resched();
- spin_lock(&sb->s_inode_list_lock);
+ dlock_list_relock(&iter);
}
- spin_unlock(&sb->s_inode_list_lock);
iput(iput_inode);
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 58b5de081b57..e873dcbe6feb 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1024,13 +1024,13 @@ static int dqinit_needed(struct inode *inode, int type)
static int add_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode, *old_inode = NULL;
+ DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
#ifdef CONFIG_QUOTA_DEBUG
int reserved = 0;
#endif
int err = 0;
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
!atomic_read(&inode->i_writecount) ||
@@ -1040,7 +1040,7 @@ static int add_dquot_ref(struct super_block *sb, int type)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
+ dlock_list_unlock(&iter);
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -1053,19 +1053,10 @@ static int add_dquot_ref(struct super_block *sb, int type)
goto out;
}
- /*
- * We hold a reference to 'inode' so it couldn't have been
- * removed from s_inodes list while we dropped the
- * s_inode_list_lock. We cannot iput the inode now as we can be
- * holding the last reference and we cannot iput it under
- * s_inode_list_lock. So we keep the reference and iput it
- * later.
- */
old_inode = inode;
cond_resched();
- spin_lock(&sb->s_inode_list_lock);
+ dlock_list_relock(&iter);
}
- spin_unlock(&sb->s_inode_list_lock);
iput(old_inode);
out:
#ifdef CONFIG_QUOTA_DEBUG
@@ -1081,12 +1072,12 @@ static int add_dquot_ref(struct super_block *sb, int type)
static void remove_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode;
+ DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
#ifdef CONFIG_QUOTA_DEBUG
int reserved = 0;
#endif
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
/*
* We have to scan also I_NEW inodes because they can already
* have quota pointer initialized. Luckily, we need to touch
@@ -1108,7 +1099,6 @@ static void remove_dquot_ref(struct super_block *sb, int type)
}
spin_unlock(&dq_data_lock);
}
- spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
if (reserved) {
printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/super.c b/fs/super.c
index 076392396e72..61c19e3f06d8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -303,6 +303,7 @@ static void destroy_unused_super(struct super_block *s)
super_unlock_excl(s);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
+ free_dlock_list_heads(&s->s_inodes);
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
@@ -367,8 +368,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_roots);
mutex_init(&s->s_sync_lock);
- INIT_LIST_HEAD(&s->s_inodes);
- spin_lock_init(&s->s_inode_list_lock);
INIT_LIST_HEAD(&s->s_inodes_wb);
spin_lock_init(&s->s_inode_wblist_lock);
@@ -383,6 +382,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_time_min = TIME64_MIN;
s->s_time_max = TIME64_MAX;
+ if (alloc_dlock_list_heads(&s->s_inodes))
+ goto fail;
+
s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
"sb-%s", type->name);
if (!s->s_shrink)
@@ -695,7 +697,7 @@ void generic_shutdown_super(struct super_block *sb)
if (sop->put_super)
sop->put_super(sb);
- if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes),
+ if (CHECK_DATA_CORRUPTION(!dlock_lists_empty(&sb->s_inodes),
"VFS: Busy inodes after unmount of %s (%s)",
sb->s_id, sb->s_type->name)) {
/*
@@ -704,14 +706,13 @@ void generic_shutdown_super(struct super_block *sb)
* iput_final() or such crashes cleanly.
*/
struct inode *inode;
+ DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
inode->i_op = VFS_PTR_POISON;
inode->i_sb = VFS_PTR_POISON;
inode->i_mapping = VFS_PTR_POISON;
}
- spin_unlock(&sb->s_inode_list_lock);
}
}
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..bb35591733f1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -43,6 +43,7 @@
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
+#include <linux/dlock-list.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -702,7 +703,7 @@ struct inode {
u16 i_wb_frn_history;
#endif
struct list_head i_lru; /* inode LRU list */
- struct list_head i_sb_list;
+ struct dlock_list_node i_sb_list;
struct list_head i_wb_list; /* backing dev writeback list */
union {
struct hlist_head i_dentry;
@@ -1315,9 +1316,8 @@ struct super_block {
*/
int s_stack_depth;
- /* s_inode_list_lock protects s_inodes */
- spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
- struct list_head s_inodes; /* all inodes */
+ /* The internal per-list locks protect s_inodes */
+ struct dlock_list_heads s_inodes; /* all inodes */
spinlock_t s_inode_wblist_lock;
struct list_head s_inodes_wb; /* writeback inodes */
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index bc7c126deea2..4269d9938c09 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -844,12 +844,12 @@ static void hook_inode_free_security(struct inode *const inode)
static void hook_sb_delete(struct super_block *const sb)
{
struct inode *inode, *prev_inode = NULL;
+ DEFINE_DLOCK_LIST_ITER(iter, &sb->s_inodes);
if (!landlock_initialized)
return;
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ dlist_for_each_entry(inode, &iter, i_sb_list) {
struct landlock_object *object;
/* Only handles referenced inodes. */
@@ -883,6 +883,7 @@ static void hook_sb_delete(struct super_block *const sb)
/* Keeps a reference to this inode until the next loop walk. */
__iget(inode);
spin_unlock(&inode->i_lock);
+ dlock_list_unlock(&iter);
/*
* If there is no concurrent release_inode() ongoing, then we
@@ -917,25 +918,11 @@ static void hook_sb_delete(struct super_block *const sb)
rcu_read_unlock();
}
- if (prev_inode) {
- /*
- * At this point, we still own the __iget() reference
- * that we just set in this loop walk. Therefore we
- * can drop the list lock and know that the inode won't
- * disappear from under us until the next loop walk.
- */
- spin_unlock(&sb->s_inode_list_lock);
- /*
- * We can now actually put the inode reference from the
- * previous loop walk, which is not needed anymore.
- */
- iput(prev_inode);
- cond_resched();
- spin_lock(&sb->s_inode_list_lock);
- }
+ iput(prev_inode);
prev_inode = inode;
+ cond_resched();
+ dlock_list_relock(&iter);
}
- spin_unlock(&sb->s_inode_list_lock);
/* Puts the inode reference from the last loop walk, if any. */
if (prev_inode)
--
2.42.0
* Re: [PATCH 03/11] vfs: Use dlock list for superblock's inode list
2023-12-06 6:05 ` [PATCH 03/11] vfs: Use dlock list for superblock's inode list Dave Chinner
@ 2023-12-07 2:40 ` Al Viro
2023-12-07 4:59 ` Dave Chinner
0 siblings, 1 reply; 34+ messages in thread
From: Al Viro @ 2023-12-07 2:40 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:32PM +1100, Dave Chinner wrote:
> @@ -303,6 +303,7 @@ static void destroy_unused_super(struct super_block *s)
> super_unlock_excl(s);
> list_lru_destroy(&s->s_dentry_lru);
> list_lru_destroy(&s->s_inode_lru);
> + free_dlock_list_heads(&s->s_inodes);
> security_sb_free(s);
> put_user_ns(s->s_user_ns);
> kfree(s->s_subtype);
Umm... Who's going to do that on normal umount?
* Re: [PATCH 03/11] vfs: Use dlock list for superblock's inode list
2023-12-07 2:40 ` Al Viro
@ 2023-12-07 4:59 ` Dave Chinner
2023-12-07 5:03 ` Kent Overstreet
0 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-07 4:59 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Thu, Dec 07, 2023 at 02:40:24AM +0000, Al Viro wrote:
> On Wed, Dec 06, 2023 at 05:05:32PM +1100, Dave Chinner wrote:
>
> > @@ -303,6 +303,7 @@ static void destroy_unused_super(struct super_block *s)
> > super_unlock_excl(s);
> > list_lru_destroy(&s->s_dentry_lru);
> > list_lru_destroy(&s->s_inode_lru);
> > + free_dlock_list_heads(&s->s_inodes);
> > security_sb_free(s);
> > put_user_ns(s->s_user_ns);
> > kfree(s->s_subtype);
>
> Umm... Who's going to do that on normal umount?
Huh. So neither KASAN nor kmemleak has told me that s->s_inodes was
being leaked. I'm guessing a rebase sometime in the past silently
dropped a critical hunk from deactivate_locked_super() in the bit
bucket, but as nothing since whenever that happened has failed or
flagged a memory leak I didn't notice.
Such great tools we have......
-Dave.
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH 03/11] vfs: Use dlock list for superblock's inode list
2023-12-07 4:59 ` Dave Chinner
@ 2023-12-07 5:03 ` Kent Overstreet
0 siblings, 0 replies; 34+ messages in thread
From: Kent Overstreet @ 2023-12-07 5:03 UTC (permalink / raw)
To: Dave Chinner
Cc: Al Viro, linux-fsdevel, linux-block, linux-cachefs, dhowells,
gfs2, dm-devel, linux-security-module, selinux, linux-kernel,
Suren Baghdasaryan
On Thu, Dec 07, 2023 at 03:59:10PM +1100, Dave Chinner wrote:
> On Thu, Dec 07, 2023 at 02:40:24AM +0000, Al Viro wrote:
> > On Wed, Dec 06, 2023 at 05:05:32PM +1100, Dave Chinner wrote:
> >
> > > @@ -303,6 +303,7 @@ static void destroy_unused_super(struct super_block *s)
> > > super_unlock_excl(s);
> > > list_lru_destroy(&s->s_dentry_lru);
> > > list_lru_destroy(&s->s_inode_lru);
> > > + free_dlock_list_heads(&s->s_inodes);
> > > security_sb_free(s);
> > > put_user_ns(s->s_user_ns);
> > > kfree(s->s_subtype);
> >
> > Umm... Who's going to do that on normal umount?
>
> Huh. So neither KASAN nor kmemleak has told me that s->s-inodes was
> being leaked. I'm guessing a rebase sometime in the past silently
> dropped a critical hunk from deactivate_locked_super() in the bit
> bucket, but as nothing since whenever that happened has failed or
> flagged a memory leak I didn't notice.
>
> Such great tools we have......
kmemleak has always seemed flakey to me (as one would expect, it's
difficult code to test). If we can ever get memory allocation profiling
merged - it won't be a direct replacement but it'll be more reliable.
* [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (2 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 03/11] vfs: Use dlock list for superblock's inode list Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 4:31 ` Kent Overstreet
` (2 more replies)
2023-12-06 6:05 ` [PATCH 05/11] selinux: use dlist for isec inode list Dave Chinner
` (7 subsequent siblings)
11 siblings, 3 replies; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Waiman Long <longman@redhat.com>
The dlock list needs one list for each of the available CPUs. However,
sibling CPUs share the L2 and probably the L1 caches too. As a result,
there is not much to gain in terms of avoiding cacheline contention,
while separate lists increase the cacheline footprint of the L1/L2
caches as more lists may need to be in the cache.
This patch makes all the sibling CPUs share the same list, thus
reducing the number of lists that need to be maintained in each
dlock list without having any noticeable impact on performance. It
also improves dlock list iteration performance as fewer lists need
to be iterated.
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
lib/dlock-list.c | 74 ++++++++++++++++++++++++++++++++++++++----------
1 file changed, 59 insertions(+), 15 deletions(-)
diff --git a/lib/dlock-list.c b/lib/dlock-list.c
index f64ea4cc5e79..e2860944ec9f 100644
--- a/lib/dlock-list.c
+++ b/lib/dlock-list.c
@@ -25,31 +25,65 @@
* The distributed and locked list is a distributed set of lists each of
* which is protected by its own spinlock, but acts like a single
* consolidated list to the callers. For scaling purpose, the number of
- * lists used is equal to the number of possible CPUs in the system to
- * minimize contention.
+ * lists used is equal to the number of possible cores in the system to
+ * minimize contention. All threads of the same CPU core will share the
+ * same list.
*
- * However, it is possible that individual CPU numbers may be equal to
- * or greater than the number of possible CPUs when there are holes in
- * the CPU number list. As a result, we need to map the CPU number to a
- * list index.
+ * We need to map each CPU number to a list index.
*/
static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx);
+static int nr_dlock_lists __read_mostly;
/*
- * Initialize cpu2idx mapping table
+ * Initialize cpu2idx mapping table & nr_dlock_lists.
*
* It is possible that a dlock-list can be allocated before the cpu2idx is
* initialized. In this case, all the cpus are mapped to the first entry
* before initialization.
*
+ * All the sibling CPUs of a sibling group will map to the same dlock list so
+ * as to reduce the number of dlock lists to be maintained while minimizing
+ * cacheline contention.
+ *
+ * As the sibling masks are set up in the core initcall phase, this function
+ * has to be done in the postcore phase to get the right data.
*/
static int __init cpu2idx_init(void)
{
int idx, cpu;
+ struct cpumask *sibling_mask;
+ static struct cpumask mask __initdata;
+ cpumask_clear(&mask);
idx = 0;
- for_each_possible_cpu(cpu)
- per_cpu(cpu2idx, cpu) = idx++;
+ for_each_possible_cpu(cpu) {
+ int scpu;
+
+ if (cpumask_test_cpu(cpu, &mask))
+ continue;
+ per_cpu(cpu2idx, cpu) = idx;
+ cpumask_set_cpu(cpu, &mask);
+
+ sibling_mask = topology_sibling_cpumask(cpu);
+ if (sibling_mask) {
+ for_each_cpu(scpu, sibling_mask) {
+ per_cpu(cpu2idx, scpu) = idx;
+ cpumask_set_cpu(scpu, &mask);
+ }
+ }
+ idx++;
+ }
+
+ /*
+ * nr_dlock_lists can only be set after cpu2idx is properly
+ * initialized.
+ */
+ smp_mb();
+ nr_dlock_lists = idx;
+ WARN_ON(nr_dlock_lists > nr_cpu_ids);
+
+ pr_info("dlock-list: %d head entries per dlock list.\n",
+ nr_dlock_lists);
return 0;
}
postcore_initcall(cpu2idx_init);
@@ -67,19 +101,23 @@ postcore_initcall(cpu2idx_init);
*
* Dynamically allocated locks need to have their own special lock class
* to avoid lockdep warning.
+ *
+ * Since nr_dlock_lists will always be <= nr_cpu_ids, having more lists
+ * than necessary allocated is not a problem other than some wasted memory.
+ * The extra lists will not be ever used as all the cpu2idx entries will be
+ * 0 before initialization.
*/
int __alloc_dlock_list_heads(struct dlock_list_heads *dlist,
struct lock_class_key *key)
{
- int idx;
+ int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids;
- dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head),
- GFP_KERNEL);
+ dlist->heads = kcalloc(cnt, sizeof(struct dlock_list_head), GFP_KERNEL);
if (!dlist->heads)
return -ENOMEM;
- for (idx = 0; idx < nr_cpu_ids; idx++) {
+ for (idx = 0; idx < cnt; idx++) {
struct dlock_list_head *head = &dlist->heads[idx];
INIT_LIST_HEAD(&head->list);
@@ -117,7 +155,10 @@ bool dlock_lists_empty(struct dlock_list_heads *dlist)
{
int idx;
- for (idx = 0; idx < nr_cpu_ids; idx++)
+ /* Shouldn't be called before nr_dlock_lists is initialized */
+ WARN_ON_ONCE(!nr_dlock_lists);
+
+ for (idx = 0; idx < nr_dlock_lists; idx++)
if (!list_empty(&dlist->heads[idx].list))
return false;
return true;
@@ -199,6 +240,9 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
struct dlock_list_node *next;
struct dlock_list_head *head;
+ /* Shouldn't be called before nr_dlock_lists is initialized */
+ WARN_ON_ONCE(!nr_dlock_lists);
+
restart:
if (iter->entry) {
spin_unlock(&iter->entry->lock);
@@ -209,7 +253,7 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter)
/*
* Try next list
*/
- if (++iter->index >= nr_cpu_ids)
+ if (++iter->index >= nr_dlock_lists)
return NULL; /* All the entries iterated */
if (list_empty(&iter->head[iter->index].list))
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list
2023-12-06 6:05 ` [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list Dave Chinner
@ 2023-12-07 4:31 ` Kent Overstreet
2023-12-07 5:42 ` Kent Overstreet
2023-12-07 6:49 ` Al Viro
2 siblings, 0 replies; 34+ messages in thread
From: Kent Overstreet @ 2023-12-07 4:31 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:33PM +1100, Dave Chinner wrote:
> From: Waiman Long <longman@redhat.com>
>
> The dlock list needs one list for each of the CPUs available. However,
> for sibling CPUs, they are sharing the L2 and probably L1 caches
> too. As a result, there is not much to gain in terms of avoiding
> cacheline contention while increasing the cacheline footprint of the
> L1/L2 caches as separate lists may need to be in the cache.
>
> This patch makes all the sibling CPUs share the same list, thus
> reducing the number of lists that need to be maintained in each
> dlock list without having any noticeable impact on performance. It
> also improves dlock list iteration performance as fewer lists need
> to be iterated.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> Reviewed-by: Jan Kara <jack@suse.cz>
We badly need this done in a more generic way. Besides shared caches,
I've done a bunch of percpu algorithms where "amount of x stranded on
percpu lists" is a major consideration and this would be preferable over
percpu lists (including in fs/aio.c).
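For illustration, the attraction here is that draining or walking everything
other CPUs have added is just an ordinary iteration - nothing stays stranded
on a remote CPU's sublist. A rough sketch (hypothetical 'struct foo', using
the dlock_list_del_iter() helper that patch 5 of this series adds):

#include <linux/dlock-list.h>

struct foo {
	struct dlock_list_node	node;
	/* ... payload ... */
};

/* Drain every entry, no matter which CPU's sublist it was added to. */
static void foo_drain_all(struct dlock_list_heads *dlist)
{
	struct dlock_list_iter iter;
	struct foo *f, *n;

	init_dlock_list_iter(&iter, dlist);
	dlist_for_each_entry_safe(f, n, &iter, node) {
		/* the iterator holds the sublist lock, so delete via the iter */
		dlock_list_del_iter(&iter, &f->node);
		/* ... process or free f here ... */
	}
}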
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list
2023-12-06 6:05 ` [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list Dave Chinner
2023-12-07 4:31 ` Kent Overstreet
@ 2023-12-07 5:42 ` Kent Overstreet
2023-12-07 6:25 ` Dave Chinner
2023-12-07 6:49 ` Al Viro
2 siblings, 1 reply; 34+ messages in thread
From: Kent Overstreet @ 2023-12-07 5:42 UTC (permalink / raw)
To: Dave Chinner, Waiman Long
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel, Jan Kara
On Wed, Dec 06, 2023 at 05:05:33PM +1100, Dave Chinner wrote:
> From: Waiman Long <longman@redhat.com>
>
> The dlock list needs one list for each of the CPUs available. However,
> for sibling CPUs, they are sharing the L2 and probably L1 caches
> too. As a result, there is not much to gain in terms of avoiding
> cacheline contention while increasing the cacheline footprint of the
> L1/L2 caches as separate lists may need to be in the cache.
>
> This patch makes all the sibling CPUs share the same list, thus
> reducing the number of lists that need to be maintained in each
> dlock list without having any noticeable impact on performance. It
> also improves dlock list iteration performance as fewer lists need
> to be iterated.
Seems Waiman was missed on the CC
it looks like there's some duplication of this with list_lru
functionality - similar list-sharded-by-node idea.
list_lru does the sharding by page_to_nid() of the item, which saves a
pointer and allows just using a list_head in the item. OTOH, it's less
granular than what dlock-list is doing?
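Concretely, the two pick a sublist index in different ways - something like
the sketch below (illustration only, not actual list_lru or dlock-list code):

#include <linux/mm.h>
#include <linux/smp.h>

/* list_lru style: shard by the NUMA node the object's memory lives on. */
static int lru_style_index(void *item)
{
	return page_to_nid(virt_to_page(item));
}

/*
 * dlock-list style: shard by the CPU doing the insertion (patch 4 then
 * folds SMT siblings onto the same index via its internal cpu2idx map).
 */
static int dlist_style_index(void)
{
	return raw_smp_processor_id();
}

So list_lru needs no back pointer in the item but is only node-granular,
while dlock-list is CPU-granular and pays for it with the stored head
pointer in each dlock_list_node.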
I think some attempt ought to be made to factor out the common ideas
here; perhaps reworking list_lru to use this thing, and I hope someone
has looked at the page_nid idea vs. dlock_list using the current core.
But it's nice and small, and I'd like to use it elsewhere.
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list
2023-12-07 5:42 ` Kent Overstreet
@ 2023-12-07 6:25 ` Dave Chinner
0 siblings, 0 replies; 34+ messages in thread
From: Dave Chinner @ 2023-12-07 6:25 UTC (permalink / raw)
To: Kent Overstreet
Cc: Waiman Long, linux-fsdevel, linux-block, linux-cachefs, dhowells,
gfs2, dm-devel, linux-security-module, selinux, linux-kernel,
Jan Kara
On Thu, Dec 07, 2023 at 12:42:59AM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 05:05:33PM +1100, Dave Chinner wrote:
> > From: Waiman Long <longman@redhat.com>
> >
> > The dlock list needs one list for each of the CPUs available. However,
> > for sibling CPUs, they are sharing the L2 and probably L1 caches
> > too. As a result, there is not much to gain in terms of avoiding
> > cacheline contention while increasing the cacheline footprint of the
> > L1/L2 caches as separate lists may need to be in the cache.
> >
> > This patch makes all the sibling CPUs share the same list, thus
> > reducing the number of lists that need to be maintained in each
> > dlock list without having any noticeable impact on performance. It
> > also improves dlock list iteration performance as fewer lists need
> > to be iterated.
>
> Seems Waiman was missed on the CC
Oops, I knew I missed someone important....
> it looks like there's some duplication of this with list_lru
> functionality - similar list-sharded-by-node idea.
For completely different reasons. The list_lru is aligned to the mm
zone architecture which only partitions memory management accounting
and scanning actions down into NUMA nodes. It's also a per-node
ordered list (LRU), and it has intricate locking semantics that
expose internal list locks to external isolation functions that can
be called whilst a lock protected traversal is in progress.
Further, we have to consider that list-lru is tightly tied to
memcgs. For a single NUMA- and memcg- aware list-lru, there is
actually nr_memcgs * nr_nodes LRUs per list. The memory footprint of
a superblock list_lru gets quite gigantic when we start talking
about machines with hundreds of nodes running tens of thousands of
containers each with tens of superblocks.
That's the biggest problem with using a more memory expensive
structure for the list_lru - we're talking gigabytes of memory just
for the superblock shrinker tracking structure overhead on large
machines. This was one of the reasons why we haven't tried to make
list_lrus any more fine-grained than they absolutely need to be to
provide acceptable scalability.
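To put very rough numbers on that - everything below is an assumption for
illustration, not a measurement:

#include <stdio.h>

int main(void)
{
	unsigned long long per_list = 64;    /* assumed bytes per list head + lock */
	unsigned long long nodes    = 256;   /* "hundreds of nodes" */
	unsigned long long cpus     = 2048;  /* plausible CPU count on such a box */
	unsigned long long memcgs   = 20000; /* "tens of thousands of containers" */
	unsigned long long sbs      = 20;    /* "tens of superblocks" each */

	/* per-node, memcg-aware list_lru: nodes * memcgs lists per superblock */
	printf("per-node lists: ~%llu GiB\n",
	       (nodes * memcgs * sbs * per_list) >> 30);	/* ~6 GiB */

	/* the same thing sharded per CPU instead, as a dlock_list would be */
	printf("per-cpu lists:  ~%llu GiB\n",
	       (cpus * memcgs * sbs * per_list) >> 30);		/* ~48 GiB */
	return 0;
}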
> list_lru does the sharding by page_to_nid() of the item, which
> saves a pointer and allows just using a list_head in the item.
> OTOH, it's less granular than what dlock-list is doing?
Sure, but there's a lot more to list_lrus than it being a "per-node
list". OTOH, dlock_list is really nothing more than a "per-cpu
list"....
> I think some attempt ought to be made to factor out the common
> ideas here; perhaps reworking list_lru to use this thing, and I
> hope someone has looked at the page_nid idea vs. dlock_list using
> the current core.
I certainly have, and I haven't been able to justify the additional
memory footprint of a dlock_list over the existing per-node lists.
That may change given that XFS appears to be on the threshold of
per-node list-lru lock breakdown at 64 threads, but there's a lot
more to consider from a system perspective here than just
inode/dentry cache scalability....
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list
2023-12-06 6:05 ` [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list Dave Chinner
2023-12-07 4:31 ` Kent Overstreet
2023-12-07 5:42 ` Kent Overstreet
@ 2023-12-07 6:49 ` Al Viro
2 siblings, 0 replies; 34+ messages in thread
From: Al Viro @ 2023-12-07 6:49 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:33PM +1100, Dave Chinner wrote:
> From: Waiman Long <longman@redhat.com>
>
> The dlock list needs one list for each of the CPUs available. However,
> for sibling CPUs, they are sharing the L2 and probably L1 caches
> too. As a result, there is not much to gain in terms of avoiding
> cacheline contention while increasing the cacheline footprint of the
> L1/L2 caches as separate lists may need to be in the cache.
>
> This patch makes all the sibling CPUs share the same list, thus
> reducing the number of lists that need to be maintained in each
> dlock list without having any noticeable impact on performance. It
> also improves dlock list iteration performance as fewer lists need
> to be iterated.
Probably a dumb question, but... "available" != "possible"; the code
actually goes for the latter, which avoids nasty questions about
CPU hotplug interactions. Is the sibling relation on CPUs unchanging
on CPU hotplug?
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH 05/11] selinux: use dlist for isec inode list
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (3 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-06 21:52 ` Paul Moore
2023-12-06 6:05 ` [PATCH 06/11] vfs: factor out inode hash head calculation Dave Chinner
` (6 subsequent siblings)
11 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Dave Chinner <dchinner@redhat.com>
Because it's a horrible point of lock contention under heavily
concurrent directory traversals...
- 12.14% d_instantiate
- 12.06% security_d_instantiate
- 12.13% selinux_d_instantiate
- 12.16% inode_doinit_with_dentry
- 15.45% _raw_spin_lock
- do_raw_spin_lock
14.68% __pv_queued_spin_lock_slowpath
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
include/linux/dlock-list.h | 9 ++++
security/selinux/hooks.c | 72 +++++++++++++++----------------
security/selinux/include/objsec.h | 6 +--
3 files changed, 47 insertions(+), 40 deletions(-)
diff --git a/include/linux/dlock-list.h b/include/linux/dlock-list.h
index 327cb9edc7e3..7ad933b8875d 100644
--- a/include/linux/dlock-list.h
+++ b/include/linux/dlock-list.h
@@ -132,6 +132,15 @@ extern void dlock_lists_add(struct dlock_list_node *node,
struct dlock_list_heads *dlist);
extern void dlock_lists_del(struct dlock_list_node *node);
+static inline void
+dlock_list_del_iter(struct dlock_list_iter *iter,
+ struct dlock_list_node *node)
+{
+ WARN_ON_ONCE((iter->entry != READ_ONCE(node->head)));
+ list_del_init(&node->list);
+ WRITE_ONCE(node->head, NULL);
+}
+
/*
* Find the first entry of the next available list.
*/
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index feda711c6b7b..0358d7c66949 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -340,26 +340,11 @@ static struct inode_security_struct *backing_inode_security(struct dentry *dentr
static void inode_free_security(struct inode *inode)
{
struct inode_security_struct *isec = selinux_inode(inode);
- struct superblock_security_struct *sbsec;
if (!isec)
return;
- sbsec = selinux_superblock(inode->i_sb);
- /*
- * As not all inode security structures are in a list, we check for
- * empty list outside of the lock to make sure that we won't waste
- * time taking a lock doing nothing.
- *
- * The list_del_init() function can be safely called more than once.
- * It should not be possible for this function to be called with
- * concurrent list_add(), but for better safety against future changes
- * in the code, we use list_empty_careful() here.
- */
- if (!list_empty_careful(&isec->list)) {
- spin_lock(&sbsec->isec_lock);
- list_del_init(&isec->list);
- spin_unlock(&sbsec->isec_lock);
- }
+ if (!list_empty(&isec->list.list))
+ dlock_lists_del(&isec->list);
}
struct selinux_mnt_opts {
@@ -547,6 +532,8 @@ static int sb_finish_set_opts(struct super_block *sb)
struct superblock_security_struct *sbsec = selinux_superblock(sb);
struct dentry *root = sb->s_root;
struct inode *root_inode = d_backing_inode(root);
+ struct dlock_list_iter iter;
+ struct inode_security_struct *isec, *n;
int rc = 0;
if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
@@ -570,27 +557,28 @@ static int sb_finish_set_opts(struct super_block *sb)
/* Initialize the root inode. */
rc = inode_doinit_with_dentry(root_inode, root);
- /* Initialize any other inodes associated with the superblock, e.g.
- inodes created prior to initial policy load or inodes created
- during get_sb by a pseudo filesystem that directly
- populates itself. */
- spin_lock(&sbsec->isec_lock);
- while (!list_empty(&sbsec->isec_head)) {
- struct inode_security_struct *isec =
- list_first_entry(&sbsec->isec_head,
- struct inode_security_struct, list);
+ /*
+ * Initialize any other inodes associated with the superblock, e.g.
+ * inodes created prior to initial policy load or inodes created during
+ * get_sb by a pseudo filesystem that directly populates itself.
+ */
+ init_dlock_list_iter(&iter, &sbsec->isec_head);
+ dlist_for_each_entry_safe(isec, n, &iter, list) {
struct inode *inode = isec->inode;
- list_del_init(&isec->list);
- spin_unlock(&sbsec->isec_lock);
+
+ dlock_list_del_iter(&iter, &isec->list);
+ dlock_list_unlock(&iter);
+
inode = igrab(inode);
if (inode) {
if (!IS_PRIVATE(inode))
inode_doinit_with_dentry(inode, NULL);
iput(inode);
}
- spin_lock(&sbsec->isec_lock);
+
+ dlock_list_relock(&iter);
}
- spin_unlock(&sbsec->isec_lock);
+ WARN_ON_ONCE(!dlock_lists_empty(&sbsec->isec_head));
return rc;
}
@@ -1428,10 +1416,8 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
/* Defer initialization until selinux_complete_init,
after the initial policy is loaded and the security
server is ready to handle calls. */
- spin_lock(&sbsec->isec_lock);
- if (list_empty(&isec->list))
- list_add(&isec->list, &sbsec->isec_head);
- spin_unlock(&sbsec->isec_lock);
+ if (list_empty(&isec->list.list))
+ dlock_lists_add(&isec->list, &sbsec->isec_head);
goto out_unlock;
}
@@ -2548,9 +2534,10 @@ static int selinux_sb_alloc_security(struct super_block *sb)
{
struct superblock_security_struct *sbsec = selinux_superblock(sb);
+ if (alloc_dlock_list_heads(&sbsec->isec_head))
+ return -ENOMEM;
+
mutex_init(&sbsec->lock);
- INIT_LIST_HEAD(&sbsec->isec_head);
- spin_lock_init(&sbsec->isec_lock);
sbsec->sid = SECINITSID_UNLABELED;
sbsec->def_sid = SECINITSID_FILE;
sbsec->mntpoint_sid = SECINITSID_UNLABELED;
@@ -2558,6 +2545,15 @@ static int selinux_sb_alloc_security(struct super_block *sb)
return 0;
}
+static void selinux_sb_free_security(struct super_block *sb)
+{
+ struct superblock_security_struct *sbsec = selinux_superblock(sb);
+
+ if (!sbsec)
+ return;
+ free_dlock_list_heads(&sbsec->isec_head);
+}
+
static inline int opt_len(const char *s)
{
bool open_quote = false;
@@ -2841,7 +2837,7 @@ static int selinux_inode_alloc_security(struct inode *inode)
u32 sid = current_sid();
spin_lock_init(&isec->lock);
- INIT_LIST_HEAD(&isec->list);
+ init_dlock_list_node(&isec->list);
isec->inode = inode;
isec->sid = SECINITSID_UNLABELED;
isec->sclass = SECCLASS_FILE;
@@ -2920,6 +2916,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
if (rc)
return rc;
+
/* Possibly defer initialization to selinux_complete_init. */
if (sbsec->flags & SE_SBINITIALIZED) {
struct inode_security_struct *isec = selinux_inode(inode);
@@ -7215,6 +7212,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
selinux_msg_queue_alloc_security),
LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security),
+ LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 8159fd53c3de..e0709f429c56 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -24,6 +24,7 @@
#include <linux/spinlock.h>
#include <linux/lsm_hooks.h>
#include <linux/msg.h>
+#include <linux/dlock-list.h>
#include <net/net_namespace.h>
#include "flask.h"
#include "avc.h"
@@ -45,7 +46,7 @@ enum label_initialized {
struct inode_security_struct {
struct inode *inode; /* back pointer to inode object */
- struct list_head list; /* list of inode_security_struct */
+ struct dlock_list_node list; /* list of inode_security_struct */
u32 task_sid; /* SID of creating task */
u32 sid; /* SID of this object */
u16 sclass; /* security class of this object */
@@ -67,8 +68,7 @@ struct superblock_security_struct {
unsigned short behavior; /* labeling behavior */
unsigned short flags; /* which mount options were specified */
struct mutex lock;
- struct list_head isec_head;
- spinlock_t isec_lock;
+ struct dlock_list_heads isec_head;
};
struct msg_security_struct {
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 05/11] selinux: use dlist for isec inode list
2023-12-06 6:05 ` [PATCH 05/11] selinux: use dlist for isec inode list Dave Chinner
@ 2023-12-06 21:52 ` Paul Moore
2023-12-06 23:04 ` Dave Chinner
0 siblings, 1 reply; 34+ messages in thread
From: Paul Moore @ 2023-12-06 21:52 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 6, 2023 at 1:07 AM Dave Chinner <david@fromorbit.com> wrote:
>
> From: Dave Chinner <dchinner@redhat.com>
>
> Because it's a horrible point of lock contention under heavily
> concurrent directory traversals...
>
> - 12.14% d_instantiate
> - 12.06% security_d_instantiate
> - 12.13% selinux_d_instantiate
> - 12.16% inode_doinit_with_dentry
> - 15.45% _raw_spin_lock
> - do_raw_spin_lock
> 14.68% __pv_queued_spin_lock_slowpath
>
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
> include/linux/dlock-list.h | 9 ++++
> security/selinux/hooks.c | 72 +++++++++++++++----------------
> security/selinux/include/objsec.h | 6 +--
> 3 files changed, 47 insertions(+), 40 deletions(-)
In the cover letter you talk about testing, but I didn't see any
mention of testing with SELinux enabled. Given the lock contention
stats in the description above I'm going to assume you did test this
and pass along my ACK, but if you haven't tested the changes below
please do before sending this anywhere important.
Acked-by: Paul Moore <paul@paul-moore.com>
--
paul-moore.com
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 05/11] selinux: use dlist for isec inode list
2023-12-06 21:52 ` Paul Moore
@ 2023-12-06 23:04 ` Dave Chinner
2023-12-07 0:36 ` Paul Moore
0 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 23:04 UTC (permalink / raw)
To: Paul Moore
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 04:52:42PM -0500, Paul Moore wrote:
> On Wed, Dec 6, 2023 at 1:07 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > Because it's a horrible point of lock contention under heavily
> > concurrent directory traversals...
> >
> > - 12.14% d_instantiate
> > - 12.06% security_d_instantiate
> > - 12.13% selinux_d_instantiate
> > - 12.16% inode_doinit_with_dentry
> > - 15.45% _raw_spin_lock
> > - do_raw_spin_lock
> > 14.68% __pv_queued_spin_lock_slowpath
> >
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> > include/linux/dlock-list.h | 9 ++++
> > security/selinux/hooks.c | 72 +++++++++++++++----------------
> > security/selinux/include/objsec.h | 6 +--
> > 3 files changed, 47 insertions(+), 40 deletions(-)
>
> In the cover letter you talk about testing, but I didn't see any
> mention of testing with SELinux enabled. Given the lock contention
> stats in the description above I'm going to assume you did test this
> and pass along my ACK, but if you haven't tested the changes below
> please do before sending this anywhere important.
AFAIA, I've been testing with selinux enabled - I'm trying to run
these tests in an environment as close to typical production systems
as possible and that means selinux needs to be enabled.
As such, all the fstests and perf testing has been done with selinux
in permissive mode using "-o context=system_u:object_r:root_t:s0" as
the default context for the mount.
I see this sort of thing in the profiles:
- 87.13% path_lookupat
- 86.46% walk_component
- 84.20% lookup_slow
- 84.05% __lookup_slow
- 80.81% xfs_vn_lookup
- 77.84% xfs_lookup
....
- 2.91% d_splice_alias
- 1.52% security_d_instantiate
- 1.50% selinux_d_instantiate
- 1.47% inode_doinit_with_dentry
- 0.83% inode_doinit_use_xattr
0.52% __vfs_getxattr
Which tells me that selinux is definitely doing -something- on every
inode being instantiated, so I'm pretty sure the security and
selinux paths are getting exercised...
> Acked-by: Paul Moore <paul@paul-moore.com>
Thanks!
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 05/11] selinux: use dlist for isec inode list
2023-12-06 23:04 ` Dave Chinner
@ 2023-12-07 0:36 ` Paul Moore
0 siblings, 0 replies; 34+ messages in thread
From: Paul Moore @ 2023-12-07 0:36 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 6, 2023 at 6:04 PM Dave Chinner <david@fromorbit.com> wrote:
> On Wed, Dec 06, 2023 at 04:52:42PM -0500, Paul Moore wrote:
> > On Wed, Dec 6, 2023 at 1:07 AM Dave Chinner <david@fromorbit.com> wrote:
> > >
> > > From: Dave Chinner <dchinner@redhat.com>
> > >
> > > Because it's a horrible point of lock contention under heavily
> > > concurrent directory traversals...
> > >
> > > - 12.14% d_instantiate
> > > - 12.06% security_d_instantiate
> > > - 12.13% selinux_d_instantiate
> > > - 12.16% inode_doinit_with_dentry
> > > - 15.45% _raw_spin_lock
> > > - do_raw_spin_lock
> > > 14.68% __pv_queued_spin_lock_slowpath
> > >
> > >
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > ---
> > > include/linux/dlock-list.h | 9 ++++
> > > security/selinux/hooks.c | 72 +++++++++++++++----------------
> > > security/selinux/include/objsec.h | 6 +--
> > > 3 files changed, 47 insertions(+), 40 deletions(-)
> >
> > In the cover letter you talk about testing, but I didn't see any
> > mention of testing with SELinux enabled. Given the lock contention
> > stats in the description above I'm going to assume you did test this
> > and pass along my ACK, but if you haven't tested the changes below
> > please do before sending this anywhere important.
>
> AFAIA, I've been testing with selinux enabled - I'm trying to run
> these tests in an environment as close to typical production systems
> as possible and that means selinux needs to be enabled.
>
> As such, all the fstests and perf testing has been done with selinux
> in permissive mode using "-o context=system_u:object_r:root_t:s0" as
> the default context for the mount.
>
> I see this sort of thing in the profiles:
>
> - 87.13% path_lookupat
> - 86.46% walk_component
> - 84.20% lookup_slow
> - 84.05% __lookup_slow
> - 80.81% xfs_vn_lookup
> - 77.84% xfs_lookup
> ....
> - 2.91% d_splice_alias
> - 1.52% security_d_instantiate
> - 1.50% selinux_d_instantiate
> - 1.47% inode_doinit_with_dentry
> - 0.83% inode_doinit_use_xattr
> 0.52% __vfs_getxattr
>
> Which tells me that selinux is definitely doing -something- on every
> inode being instantiated, so I'm pretty sure the security and
> selinux paths are getting exercised...
That's great, thanks for the confirmation. FWIW, for these patches it
doesn't really matter if the system is in enforcing or permissive
mode, or what label you use, the important bit is that you've got a
system with SELinux enabled in the kernel and you have a reasonable
policy loaded.
--
paul-moore.com
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH 06/11] vfs: factor out inode hash head calculation
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (4 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 05/11] selinux: use dlist for isec inode list Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 3:02 ` Al Viro
2023-12-06 6:05 ` [PATCH 07/11] hlist-bl: add hlist_bl_fake() Dave Chinner
` (5 subsequent siblings)
11 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Dave Chinner <dchinner@redhat.com>
In preparation for changing the inode hash table implementation.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/inode.c | 44 +++++++++++++++++++++++++-------------------
1 file changed, 25 insertions(+), 19 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 3426691fa305..fead81550cf4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -59,6 +59,22 @@ static unsigned int i_hash_shift __ro_after_init;
static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp;
+
+ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+ L1_CACHE_BYTES;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
+ return tmp & i_hash_mask;
+}
+
+static inline struct hlist_head *i_hash_head(struct super_block *sb,
+ unsigned int hashval)
+{
+ return inode_hashtable + hash(sb, hashval);
+}
+
/*
* Empty aops. Can be used for the cases where the user does not
* define any of the address_space operations.
@@ -502,16 +518,6 @@ static inline void inode_sb_list_del(struct inode *inode)
dlock_lists_del(&inode->i_sb_list);
}
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
- unsigned long tmp;
-
- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
- L1_CACHE_BYTES;
- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
- return tmp & i_hash_mask;
-}
-
/**
* __insert_inode_hash - hash an inode
* @inode: unhashed inode
@@ -1187,7 +1193,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *data)
{
- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
+ struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
struct inode *old;
again:
@@ -1291,7 +1297,7 @@ EXPORT_SYMBOL(iget5_locked);
*/
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct hlist_head *head = i_hash_head(sb, ino);
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
@@ -1359,7 +1365,7 @@ EXPORT_SYMBOL(iget_locked);
*/
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
- struct hlist_head *b = inode_hashtable + hash(sb, ino);
+ struct hlist_head *b = i_hash_head(sb, ino);
struct inode *inode;
hlist_for_each_entry_rcu(inode, b, i_hash) {
@@ -1446,7 +1452,7 @@ EXPORT_SYMBOL(igrab);
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct hlist_head *head = i_hash_head(sb, hashval);
struct inode *inode;
spin_lock(&inode_hash_lock);
@@ -1501,7 +1507,7 @@ EXPORT_SYMBOL(ilookup5);
*/
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct hlist_head *head = i_hash_head(sb, ino);
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
@@ -1550,7 +1556,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
void *),
void *data)
{
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct hlist_head *head = i_hash_head(sb, hashval);
struct inode *inode, *ret_inode = NULL;
int mval;
@@ -1595,7 +1601,7 @@ EXPORT_SYMBOL(find_inode_nowait);
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct hlist_head *head = i_hash_head(sb, hashval);
struct inode *inode;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
@@ -1633,7 +1639,7 @@ EXPORT_SYMBOL(find_inode_rcu);
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
unsigned long ino)
{
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct hlist_head *head = i_hash_head(sb, ino);
struct inode *inode;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
@@ -1653,7 +1659,7 @@ int insert_inode_locked(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
ino_t ino = inode->i_ino;
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct hlist_head *head = i_hash_head(sb, ino);
while (1) {
struct inode *old = NULL;
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 06/11] vfs: factor out inode hash head calculation
2023-12-06 6:05 ` [PATCH 06/11] vfs: factor out inode hash head calculation Dave Chinner
@ 2023-12-07 3:02 ` Al Viro
0 siblings, 0 replies; 34+ messages in thread
From: Al Viro @ 2023-12-07 3:02 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:35PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> In preparation for changing the inode hash table implementation.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
ACKed-by: Al Viro <viro@zeniv.linux.org.uk>
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH 07/11] hlist-bl: add hlist_bl_fake()
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (5 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 06/11] vfs: factor out inode hash head calculation Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 3:05 ` Al Viro
2023-12-06 6:05 ` [PATCH 08/11] vfs: inode cache conversion to hash-bl Dave Chinner
` (4 subsequent siblings)
11 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Dave Chinner <dchinner@redhat.com>
In preparation for switching the VFS inode cache over to hlist_bl
lists, we need to be able to fake a list node that looks like it is
hashed for correct operation of filesystems that don't directly use
the VFS inode cache.
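For illustration (a sketch, not part of the patch): a fake node looks hashed
to the generic helpers, so teardown paths that unconditionally unhash keep
working even though the node was never inserted anywhere:

#include <linux/bug.h>
#include <linux/list_bl.h>

static void example_fake_node(void)
{
	struct hlist_bl_node n;

	INIT_HLIST_BL_NODE(&n);
	hlist_bl_add_fake(&n);		/* n.pprev = &n.next, no real list */

	WARN_ON(hlist_bl_unhashed(&n));	/* looks hashed... */
	WARN_ON(!hlist_bl_fake(&n));	/* ...but is detectable as fake */

	hlist_bl_del_init(&n);		/* safe despite never being on a list */
}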
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
include/linux/list_bl.h | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index ae1b541446c9..8ee2bf5af131 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
}
}
+/**
+ * hlist_bl_add_fake - create a fake list consisting of a single headless node
+ * @n: Node to make a fake list out of
+ *
+ * This makes @n appear to be its own predecessor on a headless hlist.
+ * The point of this is to allow things like hlist_bl_del() to work correctly
+ * in cases where there is no list.
+ */
+static inline void hlist_bl_add_fake(struct hlist_bl_node *n)
+{
+ n->pprev = &n->next;
+}
+
+/**
+ * hlist_bl_fake - Is this node a fake hlist_bl?
+ * @n: Node to check for being a self-referential fake hlist.
+ */
+static inline bool hlist_bl_fake(struct hlist_bl_node *n)
+{
+ return n->pprev == &n->next;
+}
+
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
bit_spin_lock(0, (unsigned long *)b);
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 07/11] hlist-bl: add hlist_bl_fake()
2023-12-06 6:05 ` [PATCH 07/11] hlist-bl: add hlist_bl_fake() Dave Chinner
@ 2023-12-07 3:05 ` Al Viro
0 siblings, 0 replies; 34+ messages in thread
From: Al Viro @ 2023-12-07 3:05 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:36PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> In preparation for switching the VFS inode cache over to hlist_bl
> lists, we need to be able to fake a list node that looks like it is
> hashed for correct operation of filesystems that don't directly use
> the VFS inode cache.
I'd probably put it as "we need hlist_bl counterparts of hlist_fake()
and hlist_add_fake()"...
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH 08/11] vfs: inode cache conversion to hash-bl
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (6 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 07/11] hlist-bl: add hlist_bl_fake() Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 4:58 ` Kent Overstreet
2023-12-07 6:42 ` Al Viro
2023-12-06 6:05 ` [PATCH 09/11] hash-bl: explicitly initialise hash-bl heads Dave Chinner
` (3 subsequent siblings)
11 siblings, 2 replies; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Dave Chinner <dchinner@redhat.com>
Scalability of the global inode_hash_lock really sucks for
filesystems that use the vfs inode cache (i.e. everything but XFS).
Profiles of a 32-way concurrent sharded directory walk (no contended
directories) on a couple of different filesystems. All numbers from
a 6.7-rc4 kernel.
Bcachefs:
- 98.78% vfs_statx
- 97.74% filename_lookup
- 97.70% path_lookupat
- 97.54% walk_component
- 97.06% lookup_slow
- 97.03% __lookup_slow
- 96.21% bch2_lookup
- 91.87% bch2_vfs_inode_get
- 84.10% iget5_locked
- 44.09% ilookup5
- 43.50% _raw_spin_lock
- 43.49% do_raw_spin_lock
42.75% __pv_queued_spin_lock_slowpath
- 39.06% inode_insert5
- 38.46% _raw_spin_lock
- 38.46% do_raw_spin_lock
37.51% __pv_queued_spin_lock_slowpath
ext4:
- 93.75% vfs_statx
- 92.39% filename_lookup
- 92.34% path_lookupat
- 92.09% walk_component
- 91.48% lookup_slow
- 91.43% __lookup_slow
- 90.18% ext4_lookup
- 84.84% __ext4_iget
- 83.67% iget_locked
- 81.24% _raw_spin_lock
- 81.23% do_raw_spin_lock
- 78.90% __pv_queued_spin_lock_slowpath
Both bcachefs and ext4 demonstrate poor scaling at >=8 threads on
concurrent lookup or create workloads.
Hence convert the inode hash table to an RCU-aware hash-bl table just
like the dentry cache. Note that we need to store a pointer to the
hlist_bl_head the inode has been added to in the inode so that when
it comes to unhash the inode we know what list to lock. We need to
do this because, unlike the dentry cache, the hash value that is
used to hash the inode is not generated from the inode itself. i.e.
filesystems can provide this themselves so we have to either store
the hashval or the hlist head pointer in the inode to be able to
find the right list head for removal...
Concurrent walk of 400k files per thread at varying thread counts,
with walk times in seconds, is as follows. Perfect scaling is an
unchanged walk time as thread count increases.
                  ext4                  bcachefs
threads     vanilla  patched      vanilla  patched
   2          7.923    7.358        8.003    7.276
   4          8.152    7.530        9.097    8.506
   8         13.090    7.871       11.752   10.015
  16         24.602    9.540       24.614   13.989
  32         49.536   19.314       49.179   25.982
The big wins here are at >= 8 threads, with both filesystems now
being limited by internal filesystem algorithms, not the VFS inode
cache scalability.
Ext4 contention moves to the buffer cache on directory block
lookups:
- 66.45% 0.44% [kernel] [k] __ext4_read_dirblock
- 66.01% __ext4_read_dirblock
- 66.01% ext4_bread
- ext4_getblk
- 64.77% bdev_getblk
- 64.69% __find_get_block
- 63.01% _raw_spin_lock
- 62.96% do_raw_spin_lock
59.21% __pv_queued_spin_lock_slowpath
bcachefs contention moves to internal btree traversal locks.
- 95.37% __lookup_slow
- 93.95% bch2_lookup
- 82.57% bch2_vfs_inode_get
- 65.44% bch2_inode_find_by_inum_trans
- 65.41% bch2_inode_peek_nowarn
- 64.60% bch2_btree_iter_peek_slot
- 64.55% bch2_btree_path_traverse_one
- bch2_btree_path_traverse_cached
- 63.02% bch2_btree_path_traverse_cached_slowpath
- 56.60% mutex_lock
- 55.29% __mutex_lock_slowpath
- 55.25% __mutex_lock
50.29% osq_lock
1.84% __raw_callee_save___kvm_vcpu_is_preempted
0.54% mutex_spin_on_owner
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/inode.c | 200 ++++++++++++++++++++++++++++-----------------
include/linux/fs.h | 9 +-
2 files changed, 132 insertions(+), 77 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index fead81550cf4..3eb9c4e5b279 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -56,8 +56,7 @@
static unsigned int i_hash_mask __ro_after_init;
static unsigned int i_hash_shift __ro_after_init;
-static struct hlist_head *inode_hashtable __ro_after_init;
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
+static struct hlist_bl_head *inode_hashtable __ro_after_init;
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
@@ -69,7 +68,7 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
return tmp & i_hash_mask;
}
-static inline struct hlist_head *i_hash_head(struct super_block *sb,
+static inline struct hlist_bl_head *i_hash_head(struct super_block *sb,
unsigned int hashval)
{
return inode_hashtable + hash(sb, hashval);
@@ -434,7 +433,7 @@ EXPORT_SYMBOL(address_space_init_once);
void inode_init_once(struct inode *inode)
{
memset(inode, 0, sizeof(*inode));
- INIT_HLIST_NODE(&inode->i_hash);
+ INIT_HLIST_BL_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_wb_list);
@@ -518,6 +517,17 @@ static inline void inode_sb_list_del(struct inode *inode)
dlock_lists_del(&inode->i_sb_list);
}
+/*
+ * Ensure that we store the hash head in the inode when we insert the inode into
+ * the hlist_bl_head...
+ */
+static inline void
+__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b)
+{
+ hlist_bl_add_head_rcu(&inode->i_hash, b);
+ inode->i_hash_head = b;
+}
+
/**
* __insert_inode_hash - hash an inode
* @inode: unhashed inode
@@ -528,13 +538,13 @@ static inline void inode_sb_list_del(struct inode *inode)
*/
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
- struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
- spin_lock(&inode_hash_lock);
+ hlist_bl_lock(b);
spin_lock(&inode->i_lock);
- hlist_add_head_rcu(&inode->i_hash, b);
+ __insert_inode_hash_head(inode, b);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
}
EXPORT_SYMBOL(__insert_inode_hash);
@@ -546,11 +556,44 @@ EXPORT_SYMBOL(__insert_inode_hash);
*/
void __remove_inode_hash(struct inode *inode)
{
- spin_lock(&inode_hash_lock);
- spin_lock(&inode->i_lock);
- hlist_del_init_rcu(&inode->i_hash);
- spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
+ struct hlist_bl_head *b = inode->i_hash_head;
+
+ /*
+ * There are some callers that come through here without synchronisation
+ * and potentially with multiple references to the inode. Hence we have
+ * to handle the case that we might race with a remove and insert to a
+ * different list. Coda, in particular, seems to have a userspace API
+ * that can directly trigger "unhash/rehash to different list" behaviour
+ * without any serialisation at all.
+ *
+ * Hence we have to handle the situation where the inode->i_hash_head
+ * might point to a different list than what we expect, indicating that
+ * we raced with another unhash and potentially a new insertion. This
+ * means we have to retest the head once we have everything locked up
+ * and loop again if it doesn't match.
+ */
+ while (b) {
+ hlist_bl_lock(b);
+ spin_lock(&inode->i_lock);
+ if (b != inode->i_hash_head) {
+ hlist_bl_unlock(b);
+ b = inode->i_hash_head;
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ /*
+ * Need to set the pprev pointer to NULL after list removal so
+ * that both RCU traversals and hlist_bl_unhashed() work
+ * correctly at this point.
+ */
+ hlist_bl_del_rcu(&inode->i_hash);
+ inode->i_hash.pprev = NULL;
+ inode->i_hash_head = NULL;
+ spin_unlock(&inode->i_lock);
+ hlist_bl_unlock(b);
+ break;
+ }
+
}
EXPORT_SYMBOL(__remove_inode_hash);
@@ -886,26 +929,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
return freed;
}
-static void __wait_on_freeing_inode(struct inode *inode);
+static void __wait_on_freeing_inode(struct hlist_bl_head *b,
+ struct inode *inode);
/*
* Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
- struct hlist_head *head,
+ struct hlist_bl_head *b,
int (*test)(struct inode *, void *),
void *data)
{
+ struct hlist_bl_node *node;
struct inode *inode = NULL;
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_bl_for_each_entry(inode, node, b, i_hash) {
if (inode->i_sb != sb)
continue;
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(b, inode);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
@@ -924,19 +969,20 @@ static struct inode *find_inode(struct super_block *sb,
* iget_locked for details.
*/
static struct inode *find_inode_fast(struct super_block *sb,
- struct hlist_head *head, unsigned long ino)
+ struct hlist_bl_head *b, unsigned long ino)
{
+ struct hlist_bl_node *node;
struct inode *inode = NULL;
repeat:
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_bl_for_each_entry(inode, node, b, i_hash) {
if (inode->i_ino != ino)
continue;
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
- __wait_on_freeing_inode(inode);
+ __wait_on_freeing_inode(b, inode);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
@@ -1186,25 +1232,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
* return it locked, hashed, and with the I_NEW flag set. The file system gets
* to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note both @test and @set are called with the inode hash chain lock held,
+ * so can't sleep.
*/
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *data)
{
- struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
+ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
struct inode *old;
again:
- spin_lock(&inode_hash_lock);
- old = find_inode(inode->i_sb, head, test, data);
+ hlist_bl_lock(b);
+ old = find_inode(inode->i_sb, b, test, data);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
* Use the old inode instead of the preallocated one.
*/
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
if (IS_ERR(old))
return NULL;
wait_on_inode(old);
@@ -1226,7 +1272,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
*/
spin_lock(&inode->i_lock);
inode->i_state |= I_NEW;
- hlist_add_head_rcu(&inode->i_hash, head);
+ __insert_inode_hash_head(inode, b);
spin_unlock(&inode->i_lock);
/*
@@ -1236,7 +1282,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
if (list_empty(&inode->i_sb_list.list))
inode_sb_list_add(inode);
unlock:
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
return inode;
}
@@ -1297,12 +1343,12 @@ EXPORT_SYMBOL(iget5_locked);
*/
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
- struct hlist_head *head = i_hash_head(sb, ino);
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
struct inode *inode;
again:
- spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_lock(b);
+ inode = find_inode_fast(sb, b, ino);
+ hlist_bl_unlock(b);
if (inode) {
if (IS_ERR(inode))
return NULL;
@@ -1318,17 +1364,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
if (inode) {
struct inode *old;
- spin_lock(&inode_hash_lock);
+ hlist_bl_lock(b);
/* We released the lock, so.. */
- old = find_inode_fast(sb, head, ino);
+ old = find_inode_fast(sb, b, ino);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
inode->i_state = I_NEW;
- hlist_add_head_rcu(&inode->i_hash, head);
+ __insert_inode_hash_head(inode, b);
spin_unlock(&inode->i_lock);
inode_sb_list_add(inode);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
@@ -1341,7 +1387,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
* us. Use the old inode instead of the one we just
* allocated.
*/
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
destroy_inode(inode);
if (IS_ERR(old))
return NULL;
@@ -1365,10 +1411,11 @@ EXPORT_SYMBOL(iget_locked);
*/
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
- struct hlist_head *b = i_hash_head(sb, ino);
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
+ struct hlist_bl_node *node;
struct inode *inode;
- hlist_for_each_entry_rcu(inode, b, i_hash) {
+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
if (inode->i_ino == ino && inode->i_sb == sb)
return 0;
}
@@ -1452,12 +1499,12 @@ EXPORT_SYMBOL(igrab);
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct hlist_head *head = i_hash_head(sb, hashval);
+ struct hlist_bl_head *b = i_hash_head(sb, hashval);
struct inode *inode;
- spin_lock(&inode_hash_lock);
- inode = find_inode(sb, head, test, data);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_lock(b);
+ inode = find_inode(sb, b, test, data);
+ hlist_bl_unlock(b);
return IS_ERR(inode) ? NULL : inode;
}
@@ -1507,12 +1554,12 @@ EXPORT_SYMBOL(ilookup5);
*/
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
- struct hlist_head *head = i_hash_head(sb, ino);
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
struct inode *inode;
again:
- spin_lock(&inode_hash_lock);
- inode = find_inode_fast(sb, head, ino);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_lock(b);
+ inode = find_inode_fast(sb, b, ino);
+ hlist_bl_unlock(b);
if (inode) {
if (IS_ERR(inode))
@@ -1556,12 +1603,13 @@ struct inode *find_inode_nowait(struct super_block *sb,
void *),
void *data)
{
- struct hlist_head *head = i_hash_head(sb, hashval);
+ struct hlist_bl_head *b = i_hash_head(sb, hashval);
+ struct hlist_bl_node *node;
struct inode *inode, *ret_inode = NULL;
int mval;
- spin_lock(&inode_hash_lock);
- hlist_for_each_entry(inode, head, i_hash) {
+ hlist_bl_lock(b);
+ hlist_bl_for_each_entry(inode, node, b, i_hash) {
if (inode->i_sb != sb)
continue;
mval = match(inode, hashval, data);
@@ -1572,7 +1620,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
goto out;
}
out:
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);
@@ -1601,13 +1649,14 @@ EXPORT_SYMBOL(find_inode_nowait);
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct hlist_head *head = i_hash_head(sb, hashval);
+ struct hlist_bl_head *b = i_hash_head(sb, hashval);
+ struct hlist_bl_node *node;
struct inode *inode;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
"suspicious find_inode_rcu() usage");
- hlist_for_each_entry_rcu(inode, head, i_hash) {
+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
if (inode->i_sb == sb &&
!(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
test(inode, data))
@@ -1639,13 +1688,14 @@ EXPORT_SYMBOL(find_inode_rcu);
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
unsigned long ino)
{
- struct hlist_head *head = i_hash_head(sb, ino);
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
+ struct hlist_bl_node *node;
struct inode *inode;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
"suspicious find_inode_by_ino_rcu() usage");
- hlist_for_each_entry_rcu(inode, head, i_hash) {
+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
if (inode->i_ino == ino &&
inode->i_sb == sb &&
!(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
@@ -1659,39 +1709,42 @@ int insert_inode_locked(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
ino_t ino = inode->i_ino;
- struct hlist_head *head = i_hash_head(sb, ino);
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
while (1) {
- struct inode *old = NULL;
- spin_lock(&inode_hash_lock);
- hlist_for_each_entry(old, head, i_hash) {
- if (old->i_ino != ino)
+ struct hlist_bl_node *node;
+ struct inode *old = NULL, *t;
+
+ hlist_bl_lock(b);
+ hlist_bl_for_each_entry(t, node, b, i_hash) {
+ if (t->i_ino != ino)
continue;
- if (old->i_sb != sb)
+ if (t->i_sb != sb)
continue;
- spin_lock(&old->i_lock);
- if (old->i_state & (I_FREEING|I_WILL_FREE)) {
- spin_unlock(&old->i_lock);
+ spin_lock(&t->i_lock);
+ if (t->i_state & (I_FREEING|I_WILL_FREE)) {
+ spin_unlock(&t->i_lock);
continue;
}
+ old = t;
break;
}
if (likely(!old)) {
spin_lock(&inode->i_lock);
inode->i_state |= I_NEW | I_CREATING;
- hlist_add_head_rcu(&inode->i_hash, head);
+ __insert_inode_hash_head(inode, b);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
return 0;
}
if (unlikely(old->i_state & I_CREATING)) {
spin_unlock(&old->i_lock);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
return -EBUSY;
}
__iget(old);
spin_unlock(&old->i_lock);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
wait_on_inode(old);
if (unlikely(!inode_unhashed(old))) {
iput(old);
@@ -2271,17 +2324,18 @@ EXPORT_SYMBOL(inode_needs_sync);
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
* will DTRT.
*/
-static void __wait_on_freeing_inode(struct inode *inode)
+static void __wait_on_freeing_inode(struct hlist_bl_head *b,
+ struct inode *inode)
{
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
+ hlist_bl_unlock(b);
schedule();
finish_wait(wq, &wait.wq_entry);
- spin_lock(&inode_hash_lock);
+ hlist_bl_lock(b);
}
static __initdata unsigned long ihash_entries;
@@ -2307,7 +2361,7 @@ void __init inode_init_early(void)
inode_hashtable =
alloc_large_system_hash("Inode-cache",
- sizeof(struct hlist_head),
+ sizeof(struct hlist_bl_head),
ihash_entries,
14,
HASH_EARLY | HASH_ZERO,
@@ -2333,7 +2387,7 @@ void __init inode_init(void)
inode_hashtable =
alloc_large_system_hash("Inode-cache",
- sizeof(struct hlist_head),
+ sizeof(struct hlist_bl_head),
ihash_entries,
14,
HASH_ZERO,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bb35591733f1..0ef1b72340c7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -692,7 +692,8 @@ struct inode {
unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned long dirtied_time_when;
- struct hlist_node i_hash;
+ struct hlist_bl_node i_hash;
+ struct hlist_bl_head *i_hash_head;
struct list_head i_io_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */
@@ -758,7 +759,7 @@ static inline unsigned int i_blocksize(const struct inode *node)
static inline int inode_unhashed(struct inode *inode)
{
- return hlist_unhashed(&inode->i_hash);
+ return hlist_bl_unhashed(&inode->i_hash);
}
/*
@@ -769,7 +770,7 @@ static inline int inode_unhashed(struct inode *inode)
*/
static inline void inode_fake_hash(struct inode *inode)
{
- hlist_add_fake(&inode->i_hash);
+ hlist_bl_add_fake(&inode->i_hash);
}
/*
@@ -2946,7 +2947,7 @@ static inline void insert_inode_hash(struct inode *inode)
extern void __remove_inode_hash(struct inode *);
static inline void remove_inode_hash(struct inode *inode)
{
- if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
+ if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash))
__remove_inode_hash(inode);
}
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 08/11] vfs: inode cache conversion to hash-bl
2023-12-06 6:05 ` [PATCH 08/11] vfs: inode cache conversion to hash-bl Dave Chinner
@ 2023-12-07 4:58 ` Kent Overstreet
2023-12-07 6:03 ` Dave Chinner
2023-12-07 6:42 ` Al Viro
1 sibling, 1 reply; 34+ messages in thread
From: Kent Overstreet @ 2023-12-07 4:58 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:37PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Scalability of the global inode_hash_lock really sucks for
> filesystems that use the vfs inode cache (i.e. everything but XFS).
Ages ago, we talked about (and I attempted, but ended up swearing at
inode lifetime rules) - conversion to rhashtable instead, which I still
believe would be preferable since that code is fully lockless (and
resizeable, of course). But it turned out to be a much bigger project...
But IIRC the bulk of the work was going to be "clean up inode
refcounting/lifetime rules into something sane/modern" - maybe we could
leave some breadcrumbs/comments in fs/inode.c for what that would take,
if/when someone else is sufficiently motivated?
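For the record, the mechanical part of such a conversion would look something
like the sketch below - purely hypothetical, with a stand-in structure,
because the real pain is the inode lifetime/refcounting and the fs-provided
hashval, none of which this addresses:

#include <linux/rhashtable.h>

/* Stand-in object; a real conversion would embed rhash_head in struct inode. */
struct example_cached_inode {
	unsigned long		ino;
	struct rhash_head	rhash;
};

static const struct rhashtable_params example_inode_rht_params = {
	.head_offset		= offsetof(struct example_cached_inode, rhash),
	.key_offset		= offsetof(struct example_cached_inode, ino),
	.key_len		= sizeof(unsigned long),
	.automatic_shrinking	= true,
};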
> threads vanilla patched vanilla patched
> 2 7.923 7.358 8.003 7.276
> 4 8.152 7.530 9.097 8.506
> 8 13.090 7.871 11.752 10.015
> 16 24.602 9.540 24.614 13.989
> 32 49.536 19.314 49.179 25.982
nice
> The big wins here are at >= 8 threads, with both filesytsems now
> being limited by internal filesystem algorithms, not the VFS inode
> cache scalability.
>
> Ext4 contention moves to the buffer cache on directory block
> lookups:
>
> - 66.45% 0.44% [kernel] [k] __ext4_read_dirblock
> - 66.01% __ext4_read_dirblock
> - 66.01% ext4_bread
> - ext4_getblk
> - 64.77% bdev_getblk
> - 64.69% __find_get_block
> - 63.01% _raw_spin_lock
> - 62.96% do_raw_spin_lock
> 59.21% __pv_queued_spin_lock_slowpath
>
> bcachefs contention moves to internal btree traversal locks.
>
> - 95.37% __lookup_slow
> - 93.95% bch2_lookup
> - 82.57% bch2_vfs_inode_get
> - 65.44% bch2_inode_find_by_inum_trans
> - 65.41% bch2_inode_peek_nowarn
> - 64.60% bch2_btree_iter_peek_slot
> - 64.55% bch2_btree_path_traverse_one
> - bch2_btree_path_traverse_cached
> - 63.02% bch2_btree_path_traverse_cached_slowpath
> - 56.60% mutex_lock
dlist-lock ought to be perfect for solving this one
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 08/11] vfs: inode cache conversion to hash-bl
2023-12-07 4:58 ` Kent Overstreet
@ 2023-12-07 6:03 ` Dave Chinner
0 siblings, 0 replies; 34+ messages in thread
From: Dave Chinner @ 2023-12-07 6:03 UTC (permalink / raw)
To: Kent Overstreet
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 11:58:44PM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 05:05:37PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > Scalability of the global inode_hash_lock really sucks for
> > filesystems that use the vfs inode cache (i.e. everything but XFS).
>
> Ages ago, we talked about (and I attempted, but ended up swearing at
> inode lifetime rules) - conversion to rhashtable instead, which I still
> believe would be preferable since that code is fully lockless (and
> resizeable, of course). But it turned out to be a much bigger project...
I don't think that the size of the hash table is a big issue at the
moment. We already have RCU lookups for the inode cache
(find_inode_rcu() and find_inode_by_ino_rcu()) even before this
patchset, so we don't need rhashtable for that.
We still have to prevent duplicate inodes from being added to the cache
due to racing inserts, so I think we still need some form of
serialisation on the "lookup miss+insert" side. I've not thought
about it further than that - the hash-bl removes the existing
VFS contention points and the limitations move to
filesystem internal algorithms once again.
So until the filesystems can scale to much larger thread counts and
put the pressure back on the VFS inode cache scalability, I
don't see any need to try to do anything more complex or smarter...
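To make the "lookup miss + insert" serialisation concrete, here's a rough
sketch of the pattern, assuming the hash-bl primitives from this series and
the hlist_bl_node i_hash from the earlier conversion patch;
__find_inode_by_ino() is a hypothetical chain-walk helper, not the actual
fs/inode.c code:

static struct inode *insert_or_find_inode(struct hlist_bl_head *b,
					  struct super_block *sb,
					  unsigned long ino,
					  struct inode *new)
{
	struct inode *old;

	hlist_bl_lock(b);
	/* Recheck under the chain lock - another CPU may have won the race. */
	old = __find_inode_by_ino(b, sb, ino);	/* hypothetical helper */
	if (old) {
		__iget(old);
		hlist_bl_unlock(b);
		return old;	/* lost the race, reuse the inode already hashed */
	}
	hlist_bl_add_head(&new->i_hash, b);
	hlist_bl_unlock(b);
	return new;		/* won the race, the new inode is now visible */
}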
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 08/11] vfs: inode cache conversion to hash-bl
2023-12-06 6:05 ` [PATCH 08/11] vfs: inode cache conversion to hash-bl Dave Chinner
2023-12-07 4:58 ` Kent Overstreet
@ 2023-12-07 6:42 ` Al Viro
1 sibling, 0 replies; 34+ messages in thread
From: Al Viro @ 2023-12-07 6:42 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:37PM +1100, Dave Chinner wrote:
> + /*
> + * There are some callers that come through here without synchronisation
> + * and potentially with multiple references to the inode. Hence we have
> + * to handle the case that we might race with a remove and insert to a
> + * different list. Coda, in particular, seems to have a userspace API
> + * that can directly trigger "unhash/rehash to different list" behaviour
> + * without any serialisation at all.
> + *
> + * Hence we have to handle the situation where the inode->i_hash_head
> + * might point to a different list than what we expect, indicating that
> + * we raced with another unhash and potentially a new insertion. This
> + * means we have to retest the head once we have everything locked up
> + * and loop again if it doesn't match.
> + */
coda_replace_fid() is an old headache, but it's thankfully unique - nobody else
does that kind of shit (just rechecked).
Note that coda_replace_fid() is not going to have the sucker racing with
removal from another source, and I'm 100% sure that they really want
some serialization for handling those requests.
remove_inode_hash() is misused there - "in the middle of hash key change"
is not the same state as "unhashed".
Any races between insert and unhash are bugs, not something to support.
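For readers following along, the retry pattern the quoted comment describes
looks roughly like this - a sketch based on the commit comment rather than
the literal patch code, with i_hash_head and the hash-bl helpers coming from
earlier patches in the series:

static void remove_inode_hash_sketch(struct inode *inode)
{
	struct hlist_bl_head *b;

again:
	b = inode->i_hash_head;
	if (!b)
		return;			/* already unhashed */
	hlist_bl_lock(b);
	if (inode->i_hash_head != b) {
		/*
		 * Raced with an unhash plus a reinsert onto a different
		 * chain (the coda_replace_fid() case) - retry against
		 * the new head.
		 */
		hlist_bl_unlock(b);
		goto again;
	}
	hlist_bl_del_init(&inode->i_hash);
	inode->i_hash_head = NULL;
	hlist_bl_unlock(b);
}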
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH 09/11] hash-bl: explicitly initialise hash-bl heads
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (7 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 08/11] vfs: inode cache conversion to hash-bl Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 3:15 ` Al Viro
2023-12-06 6:05 ` [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep Dave Chinner
` (2 subsequent siblings)
11 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Dave Chinner <dchinner@redhat.com>
Because we are going to change how the structure is laid out to
support PREEMPT_RT and LOCKDEP, just assuming that the hash table is
allocated as zeroed memory is no longer sufficient to initialise
a hash-bl table.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
fs/dcache.c | 21 ++++++++++++++++++++-
fs/fscache/cookie.c | 8 ++++++++
fs/fscache/internal.h | 6 ++++--
fs/fscache/main.c | 3 +++
fs/fscache/volume.c | 8 ++++++++
fs/inode.c | 19 ++++++++++++++++++-
6 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index c82ae731df9a..9059b3a55370 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3284,7 +3284,10 @@ __setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
- /* If hashes are distributed across NUMA nodes, defer
+ int i;
+
+ /*
+ * If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
if (hashdist)
@@ -3300,11 +3303,20 @@ static void __init dcache_init_early(void)
NULL,
0,
0);
+ /*
+ * The value returned in d_hash_shift tells us the size of the
+ * hash table that was allocated as a log2 value.
+ */
+ for (i = 0; i < (1 << d_hash_shift); i++)
+ INIT_HLIST_BL_HEAD(&dentry_hashtable[i]);
+
d_hash_shift = 32 - d_hash_shift;
}
static void __init dcache_init(void)
{
+ int i;
+
/*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature
@@ -3328,6 +3340,13 @@ static void __init dcache_init(void)
NULL,
0,
0);
+ /*
+ * The value returned in d_hash_shift tells us the size of the
+ * hash table that was allocated as a log2 value.
+ */
+ for (i = 0; i < (1 << d_hash_shift); i++)
+ INIT_HLIST_BL_HEAD(&dentry_hashtable[i]);
+
d_hash_shift = 32 - d_hash_shift;
}
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index bce2492186d0..21617f7c88e4 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -32,6 +32,14 @@ static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker);
static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD";
static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
+void fscache_cookie_hash_init(void)
+{
+ int i;
+
+ for (i = 0; i < (1 << fscache_cookie_hash_shift); i++)
+ INIT_HLIST_BL_HEAD(&fscache_cookie_hash[i]);
+}
+
void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
{
const u8 *k;
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 1336f517e9b1..6cbe07decc11 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -61,8 +61,9 @@ extern const struct seq_operations fscache_cookies_seq_ops;
#endif
extern struct timer_list fscache_cookie_lru_timer;
-extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
-extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+void fscache_cookie_hash_init(void);
+void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
enum fscache_access_trace why);
static inline void fscache_see_cookie(struct fscache_cookie *cookie,
@@ -143,6 +144,7 @@ int fscache_stats_show(struct seq_file *m, void *v);
extern const struct seq_operations fscache_volumes_seq_ops;
#endif
+void fscache_volume_hash_init(void);
struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
enum fscache_volume_trace where);
void fscache_put_volume(struct fscache_volume *volume,
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index dad85fd84f6f..7db2a4423315 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -92,6 +92,9 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
+ fscache_volume_hash_init();
+ fscache_cookie_hash_init();
+
pr_notice("Loaded\n");
return 0;
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
index cdf991bdd9de..8b029c46a3a3 100644
--- a/fs/fscache/volume.c
+++ b/fs/fscache/volume.c
@@ -17,6 +17,14 @@ static LIST_HEAD(fscache_volumes);
static void fscache_create_volume_work(struct work_struct *work);
+void fscache_volume_hash_init(void)
+{
+ int i;
+
+ for (i = 0; i < (1 << fscache_volume_hash_shift); i++)
+ INIT_HLIST_BL_HEAD(&fscache_volume_hash[i]);
+}
+
struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
enum fscache_volume_trace where)
{
diff --git a/fs/inode.c b/fs/inode.c
index 3eb9c4e5b279..57c1030ccad3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2353,7 +2353,10 @@ __setup("ihash_entries=", set_ihash_entries);
*/
void __init inode_init_early(void)
{
- /* If hashes are distributed across NUMA nodes, defer
+ int i;
+
+ /*
+ * If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
*/
if (hashdist)
@@ -2369,10 +2372,18 @@ void __init inode_init_early(void)
&i_hash_mask,
0,
0);
+ /*
+ * The value returned in i_hash_shift tells us the size of the
+ * hash table that was allocated as a log2 value.
+ */
+ for (i = 0; i < (1 << i_hash_shift); i++)
+ INIT_HLIST_BL_HEAD(&inode_hashtable[i]);
}
void __init inode_init(void)
{
+ int i;
+
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
sizeof(struct inode),
@@ -2395,6 +2406,12 @@ void __init inode_init(void)
&i_hash_mask,
0,
0);
+ /*
+ * The value returned in i_hash_shift tells us the size of the
+ * hash table that was allocated as a log2 value.
+ */
+ for (i = 0; i < (1 << i_hash_shift); i++)
+ INIT_HLIST_BL_HEAD(&inode_hashtable[i]);
}
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 09/11] hash-bl: explicitly initialise hash-bl heads
2023-12-06 6:05 ` [PATCH 09/11] hash-bl: explicitly initialise hash-bl heads Dave Chinner
@ 2023-12-07 3:15 ` Al Viro
0 siblings, 0 replies; 34+ messages in thread
From: Al Viro @ 2023-12-07 3:15 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:38PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> Because we are going to change how the structure is laid out to
> support PREEMPT_RT and LOCKDEP, just assuming that the hash table is
> allocated as zeroed memory is no longer sufficient to initialise
> a hash-bl table.
static inline void init_bl_hash(struct hlist_bl *table, int shift)?
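A minimal sketch of that helper, taking the argument as struct hlist_bl_head
(which appears to be what is meant), so the open-coded loops in dcache.c,
fs/inode.c and fscache collapse into single calls such as
init_bl_hash(inode_hashtable, i_hash_shift):

static inline void init_bl_hash(struct hlist_bl_head *table, int shift)
{
	int i;

	for (i = 0; i < (1 << shift); i++)
		INIT_HLIST_BL_HEAD(&table[i]);
}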
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (8 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 09/11] hash-bl: explicitly initialise hash-bl heads Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 4:16 ` Kent Overstreet
2023-12-06 6:05 ` [PATCH 11/11] hlist-bl: introduced nested locking for dm-snap Dave Chinner
2023-12-07 17:08 ` [PATCH 0/11] vfs: inode cache scalability improvements Kent Overstreet
11 siblings, 1 reply; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Dave Chinner <dchinner@redhat.com>
hash-bl nests spinlocks inside the bit locks. This causes problems
for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks,
and we're not allowed to sleep while holding a spinning lock.
Further, lockdep does not support bit locks, so we lose lockdep
coverage of the inode hash table with the hash-bl conversion.
To enable these configs to work, add an external per-chain spinlock
to the hlist_bl_head() and add helpers to use this instead of the
bit spinlock when preempt_rt or lockdep are enabled.
This converts all users of hlist-bl to use the external spinlock in
these situations, so we also gain lockdep coverage of things like
the dentry cache hash table with this change.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
include/linux/list_bl.h | 126 ++++++++++++++++++++++++++++---------
include/linux/rculist_bl.h | 13 ++++
2 files changed, 110 insertions(+), 29 deletions(-)
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 8ee2bf5af131..990ad8e24e0b 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -4,14 +4,27 @@
#include <linux/list.h>
#include <linux/bit_spinlock.h>
+#include <linux/spinlock.h>
/*
* Special version of lists, where head of the list has a lock in the lowest
* bit. This is useful for scalable hash tables without increasing memory
* footprint overhead.
*
- * For modification operations, the 0 bit of hlist_bl_head->first
- * pointer must be set.
+ * Whilst the general use of bit spin locking is considered safe, PREEMPT_RT
+ * introduces a problem with nesting spin locks inside bit locks: spin locks
+ * become sleeping locks, and we can't sleep inside spinning locks such as bit
+ * locks. However, for RTPREEMPT, performance is less of an issue than
+ * correctness, so we trade off the memory and cache footprint of a spinlock per
+ * list so the list locks are converted to sleeping locks and work correctly
+ * with PREEMPT_RT kernels.
+ *
+ * An added advantage of this is that we can use the same trick when lockdep is
+ * enabled (again, performance doesn't matter) and gain lockdep coverage of all
+ * the hash-bl operations.
+ *
+ * For modification operations when using pure bit locking, the 0 bit of
+ * hlist_bl_head->first pointer must be set.
*
* With some small modifications, this can easily be adapted to store several
* arbitrary bits (not just a single lock bit), if the need arises to store
@@ -30,16 +43,21 @@
#define LIST_BL_BUG_ON(x)
#endif
+#undef LIST_BL_USE_SPINLOCKS
+#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_LOCKDEP)
+#define LIST_BL_USE_SPINLOCKS 1
+#endif
struct hlist_bl_head {
struct hlist_bl_node *first;
+#ifdef LIST_BL_USE_SPINLOCKS
+ spinlock_t lock;
+#endif
};
struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
};
-#define INIT_HLIST_BL_HEAD(ptr) \
- ((ptr)->first = NULL)
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
@@ -54,6 +72,69 @@ static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h)
return !h->pprev;
}
+#ifdef LIST_BL_USE_SPINLOCKS
+#define INIT_HLIST_BL_HEAD(ptr) do { \
+ (ptr)->first = NULL; \
+ spin_lock_init(&(ptr)->lock); \
+} while (0)
+
+static inline void hlist_bl_lock(struct hlist_bl_head *b)
+{
+ spin_lock(&b->lock);
+}
+
+static inline void hlist_bl_unlock(struct hlist_bl_head *b)
+{
+ spin_unlock(&b->lock);
+}
+
+static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
+{
+ return spin_is_locked(&b->lock);
+}
+
+static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
+{
+ return h->first;
+}
+
+static inline void hlist_bl_set_first(struct hlist_bl_head *h,
+ struct hlist_bl_node *n)
+{
+ h->first = n;
+}
+
+static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
+ struct hlist_bl_node *n)
+{
+ WRITE_ONCE(*pprev, n);
+}
+
+static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
+{
+ return !READ_ONCE(h->first);
+}
+
+#else /* !LIST_BL_USE_SPINLOCKS */
+
+#define INIT_HLIST_BL_HEAD(ptr) \
+ ((ptr)->first = NULL)
+
+static inline void hlist_bl_lock(struct hlist_bl_head *b)
+{
+ bit_spin_lock(0, (unsigned long *)b);
+}
+
+static inline void hlist_bl_unlock(struct hlist_bl_head *b)
+{
+ __bit_spin_unlock(0, (unsigned long *)b);
+}
+
+static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
+{
+ return bit_spin_is_locked(0, (unsigned long *)b);
+}
+
static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
{
return (struct hlist_bl_node *)
@@ -69,11 +150,21 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
}
+static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
+ struct hlist_bl_node *n)
+{
+ WRITE_ONCE(*pprev,
+ (struct hlist_bl_node *)
+ ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
+}
+
static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
}
+#endif /* LIST_BL_USE_SPINLOCKS */
+
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
struct hlist_bl_head *h)
{
@@ -94,11 +185,7 @@ static inline void hlist_bl_add_before(struct hlist_bl_node *n,
n->pprev = pprev;
n->next = next;
next->pprev = &n->next;
-
- /* pprev may be `first`, so be careful not to lose the lock bit */
- WRITE_ONCE(*pprev,
- (struct hlist_bl_node *)
- ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
+ hlist_bl_set_before(pprev, n);
}
static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
@@ -119,11 +206,7 @@ static inline void __hlist_bl_del(struct hlist_bl_node *n)
LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
- /* pprev may be `first`, so be careful not to lose the lock bit */
- WRITE_ONCE(*pprev,
- (struct hlist_bl_node *)
- ((unsigned long)next |
- ((unsigned long)*pprev & LIST_BL_LOCKMASK)));
+ hlist_bl_set_before(pprev, next);
if (next)
next->pprev = pprev;
}
@@ -165,21 +248,6 @@ static inline bool hlist_bl_fake(struct hlist_bl_node *n)
return n->pprev == &n->next;
}
-static inline void hlist_bl_lock(struct hlist_bl_head *b)
-{
- bit_spin_lock(0, (unsigned long *)b);
-}
-
-static inline void hlist_bl_unlock(struct hlist_bl_head *b)
-{
- __bit_spin_unlock(0, (unsigned long *)b);
-}
-
-static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
-{
- return bit_spin_is_locked(0, (unsigned long *)b);
-}
-
/**
* hlist_bl_for_each_entry - iterate over list of given type
* @tpos: the type * to use as a loop cursor.
diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h
index 0b952d06eb0b..2d5eb5153121 100644
--- a/include/linux/rculist_bl.h
+++ b/include/linux/rculist_bl.h
@@ -8,6 +8,18 @@
#include <linux/list_bl.h>
#include <linux/rcupdate.h>
+#ifdef LIST_BL_USE_SPINLOCKS
+static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
+ struct hlist_bl_node *n)
+{
+ rcu_assign_pointer(h->first, n);
+}
+
+static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
+{
+ return rcu_dereference_check(h->first, hlist_bl_is_locked(h));
+}
+#else /* !LIST_BL_USE_SPINLOCKS */
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
struct hlist_bl_node *n)
{
@@ -23,6 +35,7 @@ static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
return (struct hlist_bl_node *)
((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
}
+#endif /* LIST_BL_USE_SPINLOCKS */
/**
* hlist_bl_del_rcu - deletes entry from hash list without re-initialization
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep
2023-12-06 6:05 ` [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep Dave Chinner
@ 2023-12-07 4:16 ` Kent Overstreet
2023-12-07 4:41 ` Dave Chinner
0 siblings, 1 reply; 34+ messages in thread
From: Kent Overstreet @ 2023-12-07 4:16 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:39PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
>
> hash-bl nests spinlocks inside the bit locks. This causes problems
> for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks,
> and we're not allowed to sleep while holding a spinning lock.
>
> Further, lockdep does not support bit locks, so we lose lockdep
> coverage of the inode hash table with the hash-bl conversion.
>
> To enable these configs to work, add an external per-chain spinlock
> to the hlist_bl_head() and add helpers to use this instead of the
> bit spinlock when preempt_rt or lockdep are enabled.
>
> This converts all users of hlist-bl to use the external spinlock in
> these situations, so we also gain lockdep coverage of things like
> the dentry cache hash table with this change.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
Sleepable bit locks can be done with wait_on_bit(), is that worth
considering for PREEMPT_RT? Or are the other features of real locks
important there?
(not a request for the current patchset, just perhaps a note for future
work)
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
> include/linux/list_bl.h | 126 ++++++++++++++++++++++++++++---------
> include/linux/rculist_bl.h | 13 ++++
> 2 files changed, 110 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
> index 8ee2bf5af131..990ad8e24e0b 100644
> --- a/include/linux/list_bl.h
> +++ b/include/linux/list_bl.h
> @@ -4,14 +4,27 @@
>
> #include <linux/list.h>
> #include <linux/bit_spinlock.h>
> +#include <linux/spinlock.h>
>
> /*
> * Special version of lists, where head of the list has a lock in the lowest
> * bit. This is useful for scalable hash tables without increasing memory
> * footprint overhead.
> *
> - * For modification operations, the 0 bit of hlist_bl_head->first
> - * pointer must be set.
> + * Whilst the general use of bit spin locking is considered safe, PREEMPT_RT
> + * introduces a problem with nesting spin locks inside bit locks: spin locks
> + * become sleeping locks, and we can't sleep inside spinning locks such as bit
> + * locks. However, for RTPREEMPT, performance is less of an issue than
> + * correctness, so we trade off the memory and cache footprint of a spinlock per
> + * list so the list locks are converted to sleeping locks and work correctly
> + * with PREEMPT_RT kernels.
> + *
> + * An added advantage of this is that we can use the same trick when lockdep is
> + * enabled (again, performance doesn't matter) and gain lockdep coverage of all
> + * the hash-bl operations.
> + *
> + * For modification operations when using pure bit locking, the 0 bit of
> + * hlist_bl_head->first pointer must be set.
> *
> * With some small modifications, this can easily be adapted to store several
> * arbitrary bits (not just a single lock bit), if the need arises to store
> @@ -30,16 +43,21 @@
> #define LIST_BL_BUG_ON(x)
> #endif
>
> +#undef LIST_BL_USE_SPINLOCKS
> +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_LOCKDEP)
> +#define LIST_BL_USE_SPINLOCKS 1
> +#endif
>
> struct hlist_bl_head {
> struct hlist_bl_node *first;
> +#ifdef LIST_BL_USE_SPINLOCKS
> + spinlock_t lock;
> +#endif
> };
>
> struct hlist_bl_node {
> struct hlist_bl_node *next, **pprev;
> };
> -#define INIT_HLIST_BL_HEAD(ptr) \
> - ((ptr)->first = NULL)
>
> static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
> {
> @@ -54,6 +72,69 @@ static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h)
> return !h->pprev;
> }
>
> +#ifdef LIST_BL_USE_SPINLOCKS
> +#define INIT_HLIST_BL_HEAD(ptr) do { \
> + (ptr)->first = NULL; \
> + spin_lock_init(&(ptr)->lock); \
> +} while (0)
> +
> +static inline void hlist_bl_lock(struct hlist_bl_head *b)
> +{
> + spin_lock(&b->lock);
> +}
> +
> +static inline void hlist_bl_unlock(struct hlist_bl_head *b)
> +{
> + spin_unlock(&b->lock);
> +}
> +
> +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
> +{
> + return spin_is_locked(&b->lock);
> +}
> +
> +static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
> +{
> + return h->first;
> +}
> +
> +static inline void hlist_bl_set_first(struct hlist_bl_head *h,
> + struct hlist_bl_node *n)
> +{
> + h->first = n;
> +}
> +
> +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
> + struct hlist_bl_node *n)
> +{
> + WRITE_ONCE(*pprev, n);
> +}
> +
> +static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
> +{
> + return !READ_ONCE(h->first);
> +}
> +
> +#else /* !LIST_BL_USE_SPINLOCKS */
> +
> +#define INIT_HLIST_BL_HEAD(ptr) \
> + ((ptr)->first = NULL)
> +
> +static inline void hlist_bl_lock(struct hlist_bl_head *b)
> +{
> + bit_spin_lock(0, (unsigned long *)b);
> +}
> +
> +static inline void hlist_bl_unlock(struct hlist_bl_head *b)
> +{
> + __bit_spin_unlock(0, (unsigned long *)b);
> +}
> +
> +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
> +{
> + return bit_spin_is_locked(0, (unsigned long *)b);
> +}
> +
> static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
> {
> return (struct hlist_bl_node *)
> @@ -69,11 +150,21 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
> h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
> }
>
> +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
> + struct hlist_bl_node *n)
> +{
> + WRITE_ONCE(*pprev,
> + (struct hlist_bl_node *)
> + ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
> +}
> +
> static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
> {
> return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
> }
>
> +#endif /* LIST_BL_USE_SPINLOCKS */
> +
> static inline void hlist_bl_add_head(struct hlist_bl_node *n,
> struct hlist_bl_head *h)
> {
> @@ -94,11 +185,7 @@ static inline void hlist_bl_add_before(struct hlist_bl_node *n,
> n->pprev = pprev;
> n->next = next;
> next->pprev = &n->next;
> -
> - /* pprev may be `first`, so be careful not to lose the lock bit */
> - WRITE_ONCE(*pprev,
> - (struct hlist_bl_node *)
> - ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
> + hlist_bl_set_before(pprev, n);
> }
>
> static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
> @@ -119,11 +206,7 @@ static inline void __hlist_bl_del(struct hlist_bl_node *n)
>
> LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
>
> - /* pprev may be `first`, so be careful not to lose the lock bit */
> - WRITE_ONCE(*pprev,
> - (struct hlist_bl_node *)
> - ((unsigned long)next |
> - ((unsigned long)*pprev & LIST_BL_LOCKMASK)));
> + hlist_bl_set_before(pprev, next);
> if (next)
> next->pprev = pprev;
> }
> @@ -165,21 +248,6 @@ static inline bool hlist_bl_fake(struct hlist_bl_node *n)
> return n->pprev == &n->next;
> }
>
> -static inline void hlist_bl_lock(struct hlist_bl_head *b)
> -{
> - bit_spin_lock(0, (unsigned long *)b);
> -}
> -
> -static inline void hlist_bl_unlock(struct hlist_bl_head *b)
> -{
> - __bit_spin_unlock(0, (unsigned long *)b);
> -}
> -
> -static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
> -{
> - return bit_spin_is_locked(0, (unsigned long *)b);
> -}
> -
> /**
> * hlist_bl_for_each_entry - iterate over list of given type
> * @tpos: the type * to use as a loop cursor.
> diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h
> index 0b952d06eb0b..2d5eb5153121 100644
> --- a/include/linux/rculist_bl.h
> +++ b/include/linux/rculist_bl.h
> @@ -8,6 +8,18 @@
> #include <linux/list_bl.h>
> #include <linux/rcupdate.h>
>
> +#ifdef LIST_BL_USE_SPINLOCKS
> +static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
> + struct hlist_bl_node *n)
> +{
> + rcu_assign_pointer(h->first, n);
> +}
> +
> +static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
> +{
> + return rcu_dereference_check(h->first, hlist_bl_is_locked(h));
> +}
> +#else /* !LIST_BL_USE_SPINLOCKS */
> static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
> struct hlist_bl_node *n)
> {
> @@ -23,6 +35,7 @@ static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
> return (struct hlist_bl_node *)
> ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
> }
> +#endif /* LIST_BL_USE_SPINLOCKS */
>
> /**
> * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
> --
> 2.42.0
>
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep
2023-12-07 4:16 ` Kent Overstreet
@ 2023-12-07 4:41 ` Dave Chinner
0 siblings, 0 replies; 34+ messages in thread
From: Dave Chinner @ 2023-12-07 4:41 UTC (permalink / raw)
To: Kent Overstreet
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 11:16:50PM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 05:05:39PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > hash-bl nests spinlocks inside the bit locks. This causes problems
> > for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks,
> > and we're not allowed to sleep while holding a spinning lock.
> >
> > Further, lockdep does not support bit locks, so we lose lockdep
> > coverage of the inode hash table with the hash-bl conversion.
> >
> > To enable these configs to work, add an external per-chain spinlock
> > to the hlist_bl_head() and add helpers to use this instead of the
> > bit spinlock when preempt_rt or lockdep are enabled.
> >
> > This converts all users of hlist-bl to use the external spinlock in
> > these situations, so we also gain lockdep coverage of things like
> > the dentry cache hash table with this change.
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
>
> Sleepable bit locks can be done with wait_on_bit(), is that worth
> considering for PREEMPT_RT? Or are the other features of real locks
> important there?
I think wait_on_bit() is not scalable. It hashes down to one of 256
shared struct wait_queue_heads which have thundering herd
behaviours, and it requires the locker to always run
prepare_to_wait() and finish_wait(). This means there is at least
one spinlock_irqsave()/unlock pair needed, sometimes two, just to
get an uncontended sleeping bit lock.
So as a fast path operation that requires lock scalability, it's
going to be better to use a straight spinlock that doesn't require
irq safety as it's far less expensive than a sleeping bit lock.
Whether CONFIG_PREEMPT_RT changes that equation at all is not at all
clear to me, and so I'll leave that consideration to RT people if
they see a need to address it. In the meantime, we need to use an
external spinlock for lockdep validation so it really doesn't make
any sense at all to add a third locking variant with completely
different semantics just for PREEMPT_RT...
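For context, a sleepable bit lock along the lines Kent suggests would look
roughly like the sketch below - not proposed code, just an illustration.
Both the wait_on_bit_lock() slowpath and the wake_up_bit() on unlock hash
into the shared bit waitqueues discussed above:

static inline void hlist_bl_lock_sleepable(struct hlist_bl_head *b)
{
	/* sleeps until bit 0 is clear, then sets it atomically */
	wait_on_bit_lock((unsigned long *)b, 0, TASK_UNINTERRUPTIBLE);
}

static inline void hlist_bl_unlock_sleepable(struct hlist_bl_head *b)
{
	clear_bit_unlock(0, (unsigned long *)b);
	smp_mb__after_atomic();
	wake_up_bit((unsigned long *)b, 0);
}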
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH 11/11] hlist-bl: introduced nested locking for dm-snap
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (9 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep Dave Chinner
@ 2023-12-06 6:05 ` Dave Chinner
2023-12-07 17:08 ` [PATCH 0/11] vfs: inode cache scalability improvements Kent Overstreet
11 siblings, 0 replies; 34+ messages in thread
From: Dave Chinner @ 2023-12-06 6:05 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-block, linux-cachefs, dhowells, gfs2, dm-devel,
linux-security-module, selinux, linux-kernel
From: Dave Chinner <dchinner@redhat.com>
Testing with lockdep enabled threw this warning from generic/081 in
fstests:
[ 2369.724151] ============================================
[ 2369.725805] WARNING: possible recursive locking detected
[ 2369.727125] 6.7.0-rc2-dgc+ #1952 Not tainted
[ 2369.728647] --------------------------------------------
[ 2369.730197] systemd-udevd/389493 is trying to acquire lock:
[ 2369.732378] ffff888116a1a320 (&(et->table + i)->lock){+.+.}-{2:2}, at: snapshot_map+0x13e/0x7f0
[ 2369.736197]
but task is already holding lock:
[ 2369.738657] ffff8881098a4fd0 (&(et->table + i)->lock){+.+.}-{2:2}, at: snapshot_map+0x136/0x7f0
[ 2369.742118]
other info that might help us debug this:
[ 2369.744403] Possible unsafe locking scenario:
[ 2369.746814] CPU0
[ 2369.747675] ----
[ 2369.748496] lock(&(et->table + i)->lock);
[ 2369.749877] lock(&(et->table + i)->lock);
[ 2369.751241]
*** DEADLOCK ***
[ 2369.753173] May be due to missing lock nesting notation
[ 2369.754963] 4 locks held by systemd-udevd/389493:
[ 2369.756124] #0: ffff88811b3a1f48 (mapping.invalidate_lock#2){++++}-{3:3}, at: page_cache_ra_unbounded+0x69/0x190
[ 2369.758516] #1: ffff888121ceff10 (&md->io_barrier){.+.+}-{0:0}, at: dm_get_live_table+0x52/0xd0
[ 2369.760888] #2: ffff888110240078 (&s->lock#2){++++}-{3:3}, at: snapshot_map+0x12e/0x7f0
[ 2369.763254] #3: ffff8881098a4fd0 (&(et->table + i)->lock){+.+.}-{2:2}, at: snapshot_map+0x136/0x7f0
[ 2369.765896]
stack backtrace:
[ 2369.767429] CPU: 3 PID: 389493 Comm: systemd-udevd Not tainted 6.7.0-rc2-dgc+ #1952
[ 2369.770203] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 2369.773771] Call Trace:
[ 2369.774657] <TASK>
[ 2369.775494] dump_stack_lvl+0x5c/0xc0
[ 2369.776765] dump_stack+0x10/0x20
[ 2369.778031] print_deadlock_bug+0x220/0x2f0
[ 2369.779568] __lock_acquire+0x1255/0x2180
[ 2369.781013] lock_acquire+0xb9/0x2c0
[ 2369.782456] ? snapshot_map+0x13e/0x7f0
[ 2369.783927] ? snapshot_map+0x136/0x7f0
[ 2369.785240] _raw_spin_lock+0x34/0x70
[ 2369.786413] ? snapshot_map+0x13e/0x7f0
[ 2369.787482] snapshot_map+0x13e/0x7f0
[ 2369.788462] ? lockdep_init_map_type+0x75/0x250
[ 2369.789650] __map_bio+0x1d7/0x200
[ 2369.790364] dm_submit_bio+0x17d/0x570
[ 2369.791387] __submit_bio+0x4a/0x80
[ 2369.792215] submit_bio_noacct_nocheck+0x108/0x350
[ 2369.793357] submit_bio_noacct+0x115/0x450
[ 2369.794334] submit_bio+0x43/0x60
[ 2369.795112] mpage_readahead+0xf1/0x130
[ 2369.796037] ? blkdev_write_begin+0x30/0x30
[ 2369.797007] blkdev_readahead+0x15/0x20
[ 2369.797893] read_pages+0x5c/0x230
[ 2369.798703] page_cache_ra_unbounded+0x143/0x190
[ 2369.799810] force_page_cache_ra+0x9a/0xc0
[ 2369.800754] page_cache_sync_ra+0x2e/0x50
[ 2369.801704] filemap_get_pages+0x112/0x630
[ 2369.802696] ? __lock_acquire+0x413/0x2180
[ 2369.803663] filemap_read+0xfc/0x3a0
[ 2369.804527] ? __might_sleep+0x42/0x70
[ 2369.805443] blkdev_read_iter+0x6d/0x150
[ 2369.806370] vfs_read+0x1a6/0x2d0
[ 2369.807148] ksys_read+0x71/0xf0
[ 2369.807936] __x64_sys_read+0x19/0x20
[ 2369.808810] do_syscall_64+0x3c/0xe0
[ 2369.809746] entry_SYSCALL_64_after_hwframe+0x63/0x6b
[ 2369.810914] RIP: 0033:0x7f9f14dbb03d
Turns out that dm-snap holds two hash-bl locks at the same time,
so we need nesting semantics to ensure lockdep understands what is
going on.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
drivers/md/dm-snap.c | 2 +-
include/linux/list_bl.h | 10 ++++++++++
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index bf7a574499a3..cd97d5cb295d 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -645,7 +645,7 @@ static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
{
hlist_bl_lock(lock->complete_slot);
- hlist_bl_lock(lock->pending_slot);
+ hlist_bl_lock_nested(lock->pending_slot, SINGLE_DEPTH_NESTING);
}
static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 990ad8e24e0b..0e3e60c10563 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -83,6 +83,11 @@ static inline void hlist_bl_lock(struct hlist_bl_head *b)
spin_lock(&b->lock);
}
+static inline void hlist_bl_lock_nested(struct hlist_bl_head *b, int subclass)
+{
+ spin_lock_nested(&b->lock, subclass);
+}
+
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
spin_unlock(&b->lock);
@@ -125,6 +130,11 @@ static inline void hlist_bl_lock(struct hlist_bl_head *b)
bit_spin_lock(0, (unsigned long *)b);
}
+static inline void hlist_bl_lock_nested(struct hlist_bl_head *b, int subclass)
+{
+ hlist_bl_lock(b);
+}
+
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
__bit_spin_unlock(0, (unsigned long *)b);
--
2.42.0
^ permalink raw reply related [flat|nested] 34+ messages in thread
* Re: [PATCH 0/11] vfs: inode cache scalability improvements
2023-12-06 6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
` (10 preceding siblings ...)
2023-12-06 6:05 ` [PATCH 11/11] hlist-bl: introduced nested locking for dm-snap Dave Chinner
@ 2023-12-07 17:08 ` Kent Overstreet
11 siblings, 0 replies; 34+ messages in thread
From: Kent Overstreet @ 2023-12-07 17:08 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-block, linux-cachefs, dhowells, gfs2,
dm-devel, linux-security-module, selinux, linux-kernel
On Wed, Dec 06, 2023 at 05:05:29PM +1100, Dave Chinner wrote:
[...]
> Git tree containing this series can be pulled from:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git vfs-scale
For the series:
Tested-by: Kent Overstreet <kent.overstreet@linux.dev>
^ permalink raw reply [flat|nested] 34+ messages in thread