gfs2.lists.linux.dev archive mirror
 help / color / mirror / Atom feed
From: Kent Overstreet <kent.overstreet@linux.dev>
To: Dave Chinner <david@fromorbit.com>
Cc: linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org,
	linux-cachefs@redhat.com, dhowells@redhat.com,
	gfs2@lists.linux.dev, dm-devel@lists.linux.dev,
	linux-security-module@vger.kernel.org, selinux@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep
Date: Wed, 6 Dec 2023 23:16:50 -0500	[thread overview]
Message-ID: <20231207041650.3tzzmv2jfrr5vppl@moria.home.lan> (raw)
In-Reply-To: <20231206060629.2827226-11-david@fromorbit.com>

On Wed, Dec 06, 2023 at 05:05:39PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> hash-bl nests spinlocks inside the bit locks. This causes problems
> for CONFIG_PREEMPT_RT which converts spin locks to sleeping locks,
> and we're not allowed to sleep while holding a spinning lock.
> 
> Further, lockdep does not support bit locks, so we lose lockdep
> coverage of the inode hash table with the hash-bl conversion.
> 
> To enable these configs to work, add an external per-chain spinlock
> to struct hlist_bl_head and add helpers to use this instead of the
> bit spinlock when PREEMPT_RT or lockdep are enabled.
> 
> This converts all users of hlist-bl to use the external spinlock in
> these situations, so we also gain lockdep coverage of things like
> the dentry cache hash table with this change.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Sleepable bit locks can be done with wait_on_bit(); is that worth
considering for PREEMPT_RT? Or are the other features of real locks
important there?

(not a request for the current patchset, just perhaps a note for future
work)

Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>

> ---
>  include/linux/list_bl.h    | 126 ++++++++++++++++++++++++++++---------
>  include/linux/rculist_bl.h |  13 ++++
>  2 files changed, 110 insertions(+), 29 deletions(-)
> 
> diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
> index 8ee2bf5af131..990ad8e24e0b 100644
> --- a/include/linux/list_bl.h
> +++ b/include/linux/list_bl.h
> @@ -4,14 +4,27 @@
>  
>  #include <linux/list.h>
>  #include <linux/bit_spinlock.h>
> +#include <linux/spinlock.h>
>  
>  /*
>   * Special version of lists, where head of the list has a lock in the lowest
>   * bit. This is useful for scalable hash tables without increasing memory
>   * footprint overhead.
>   *
> - * For modification operations, the 0 bit of hlist_bl_head->first
> - * pointer must be set.
> + * Whilst the general use of bit spin locking is considered safe, PREEMPT_RT
> + * introduces a problem with nesting spin locks inside bit locks: spin locks
> + * become sleeping locks, and we can't sleep inside spinning locks such as bit
> + * locks. However, for PREEMPT_RT, performance is less of an issue than
> + * correctness, so we trade off the memory and cache footprint of a spinlock per
> + * list so the list locks are converted to sleeping locks and work correctly
> + * with PREEMPT_RT kernels.
> + *
> + * An added advantage of this is that we can use the same trick when lockdep is
> + * enabled (again, performance doesn't matter) and gain lockdep coverage of all
> + * the hash-bl operations.
> + *
> + * For modification operations when using pure bit locking, the 0 bit of
> + * hlist_bl_head->first pointer must be set.
>   *
>   * With some small modifications, this can easily be adapted to store several
>   * arbitrary bits (not just a single lock bit), if the need arises to store
> @@ -30,16 +43,21 @@
>  #define LIST_BL_BUG_ON(x)
>  #endif
>  
> +#undef LIST_BL_USE_SPINLOCKS
> +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_LOCKDEP)
> +#define LIST_BL_USE_SPINLOCKS	1
> +#endif
>  
>  struct hlist_bl_head {
>  	struct hlist_bl_node *first;
> +#ifdef LIST_BL_USE_SPINLOCKS
> +	spinlock_t lock;
> +#endif
>  };
>  
>  struct hlist_bl_node {
>  	struct hlist_bl_node *next, **pprev;
>  };
> -#define INIT_HLIST_BL_HEAD(ptr) \
> -	((ptr)->first = NULL)
>  
>  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
>  {
> @@ -54,6 +72,69 @@ static inline bool  hlist_bl_unhashed(const struct hlist_bl_node *h)
>  	return !h->pprev;
>  }
>  
> +#ifdef LIST_BL_USE_SPINLOCKS
> +#define INIT_HLIST_BL_HEAD(ptr) do { \
> +	(ptr)->first = NULL; \
> +	spin_lock_init(&(ptr)->lock); \
> +} while (0)
> +
> +static inline void hlist_bl_lock(struct hlist_bl_head *b)
> +{
> +	spin_lock(&b->lock);
> +}
> +
> +static inline void hlist_bl_unlock(struct hlist_bl_head *b)
> +{
> +	spin_unlock(&b->lock);
> +}
> +
> +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
> +{
> +	return spin_is_locked(&b->lock);
> +}
> +
> +static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
> +{
> +	return h->first;
> +}
> +
> +static inline void hlist_bl_set_first(struct hlist_bl_head *h,
> +					struct hlist_bl_node *n)
> +{
> +	h->first = n;
> +}
> +
> +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
> +					struct hlist_bl_node *n)
> +{
> +	WRITE_ONCE(*pprev, n);
> +}
> +
> +static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
> +{
> +	return !READ_ONCE(h->first);
> +}
> +
> +#else /* !LIST_BL_USE_SPINLOCKS */
> +
> +#define INIT_HLIST_BL_HEAD(ptr) \
> +	((ptr)->first = NULL)
> +
> +static inline void hlist_bl_lock(struct hlist_bl_head *b)
> +{
> +	bit_spin_lock(0, (unsigned long *)b);
> +}
> +
> +static inline void hlist_bl_unlock(struct hlist_bl_head *b)
> +{
> +	__bit_spin_unlock(0, (unsigned long *)b);
> +}
> +
> +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
> +{
> +	return bit_spin_is_locked(0, (unsigned long *)b);
> +}
> +
>  static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
>  {
>  	return (struct hlist_bl_node *)
> @@ -69,11 +150,21 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
>  	h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
>  }
>  
> +static inline void hlist_bl_set_before(struct hlist_bl_node **pprev,
> +					struct hlist_bl_node *n)
> +{
> +	WRITE_ONCE(*pprev,
> +		   (struct hlist_bl_node *)
> +			((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
> +}
> +
>  static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
>  {
>  	return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
>  }
>  
> +#endif /* LIST_BL_USE_SPINLOCKS */
> +
>  static inline void hlist_bl_add_head(struct hlist_bl_node *n,
>  					struct hlist_bl_head *h)
>  {
> @@ -94,11 +185,7 @@ static inline void hlist_bl_add_before(struct hlist_bl_node *n,
>  	n->pprev = pprev;
>  	n->next = next;
>  	next->pprev = &n->next;
> -
> -	/* pprev may be `first`, so be careful not to lose the lock bit */
> -	WRITE_ONCE(*pprev,
> -		   (struct hlist_bl_node *)
> -			((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
> +	hlist_bl_set_before(pprev, n);
>  }
>  
>  static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
> @@ -119,11 +206,7 @@ static inline void __hlist_bl_del(struct hlist_bl_node *n)
>  
>  	LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
>  
> -	/* pprev may be `first`, so be careful not to lose the lock bit */
> -	WRITE_ONCE(*pprev,
> -		   (struct hlist_bl_node *)
> -			((unsigned long)next |
> -			 ((unsigned long)*pprev & LIST_BL_LOCKMASK)));
> +	hlist_bl_set_before(pprev, next);
>  	if (next)
>  		next->pprev = pprev;
>  }
> @@ -165,21 +248,6 @@ static inline bool hlist_bl_fake(struct hlist_bl_node *n)
>  	return n->pprev == &n->next;
>  }
>  
> -static inline void hlist_bl_lock(struct hlist_bl_head *b)
> -{
> -	bit_spin_lock(0, (unsigned long *)b);
> -}
> -
> -static inline void hlist_bl_unlock(struct hlist_bl_head *b)
> -{
> -	__bit_spin_unlock(0, (unsigned long *)b);
> -}
> -
> -static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
> -{
> -	return bit_spin_is_locked(0, (unsigned long *)b);
> -}
> -
>  /**
>   * hlist_bl_for_each_entry	- iterate over list of given type
>   * @tpos:	the type * to use as a loop cursor.
> diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h
> index 0b952d06eb0b..2d5eb5153121 100644
> --- a/include/linux/rculist_bl.h
> +++ b/include/linux/rculist_bl.h
> @@ -8,6 +8,18 @@
>  #include <linux/list_bl.h>
>  #include <linux/rcupdate.h>
>  
> +#ifdef LIST_BL_USE_SPINLOCKS
> +static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
> +					struct hlist_bl_node *n)
> +{
> +	rcu_assign_pointer(h->first, n);
> +}
> +
> +static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
> +{
> +	return rcu_dereference_check(h->first, hlist_bl_is_locked(h));
> +}
> +#else /* !LIST_BL_USE_SPINLOCKS */
>  static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
>  					struct hlist_bl_node *n)
>  {
> @@ -23,6 +35,7 @@ static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
>  	return (struct hlist_bl_node *)
>  		((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
>  }
> +#endif /* LIST_BL_USE_SPINLOCKS */
>  
>  /**
>   * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
> -- 
> 2.42.0
> 

  reply	other threads:[~2023-12-07  4:16 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-06  6:05 [PATCH 0/11] vfs: inode cache scalability improvements Dave Chinner
2023-12-06  6:05 ` [PATCH 01/11] lib/dlock-list: Distributed and lock-protected lists Dave Chinner
2023-12-07  2:23   ` Al Viro
2023-12-06  6:05 ` [PATCH 02/11] vfs: Remove unnecessary list_for_each_entry_safe() variants Dave Chinner
2023-12-07  2:26   ` Al Viro
2023-12-07  4:18   ` Kent Overstreet
2023-12-06  6:05 ` [PATCH 03/11] vfs: Use dlock list for superblock's inode list Dave Chinner
2023-12-07  2:40   ` Al Viro
2023-12-07  4:59     ` Dave Chinner
2023-12-07  5:03       ` Kent Overstreet
2023-12-06  6:05 ` [PATCH 04/11] lib/dlock-list: Make sibling CPUs share the same linked list Dave Chinner
2023-12-07  4:31   ` Kent Overstreet
2023-12-07  5:42   ` Kent Overstreet
2023-12-07  6:25     ` Dave Chinner
2023-12-07  6:49   ` Al Viro
2023-12-06  6:05 ` [PATCH 05/11] selinux: use dlist for isec inode list Dave Chinner
2023-12-06 21:52   ` Paul Moore
2023-12-06 23:04     ` Dave Chinner
2023-12-07  0:36       ` Paul Moore
2023-12-06  6:05 ` [PATCH 06/11] vfs: factor out inode hash head calculation Dave Chinner
2023-12-07  3:02   ` Al Viro
2023-12-06  6:05 ` [PATCH 07/11] hlist-bl: add hlist_bl_fake() Dave Chinner
2023-12-07  3:05   ` Al Viro
2023-12-06  6:05 ` [PATCH 08/11] vfs: inode cache conversion to hash-bl Dave Chinner
2023-12-07  4:58   ` Kent Overstreet
2023-12-07  6:03     ` Dave Chinner
2023-12-07  6:42   ` Al Viro
2023-12-06  6:05 ` [PATCH 09/11] hash-bl: explicitly initialise hash-bl heads Dave Chinner
2023-12-07  3:15   ` Al Viro
2023-12-06  6:05 ` [PATCH 10/11] list_bl: don't use bit locks for PREEMPT_RT or lockdep Dave Chinner
2023-12-07  4:16   ` Kent Overstreet [this message]
2023-12-07  4:41     ` Dave Chinner
2023-12-06  6:05 ` [PATCH 11/11] hlist-bl: introduced nested locking for dm-snap Dave Chinner
2023-12-07 17:08 ` [PATCH 0/11] vfs: inode cache scalability improvements Kent Overstreet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231207041650.3tzzmv2jfrr5vppl@moria.home.lan \
    --to=kent.overstreet@linux.dev \
    --cc=david@fromorbit.com \
    --cc=dhowells@redhat.com \
    --cc=dm-devel@lists.linux.dev \
    --cc=gfs2@lists.linux.dev \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-cachefs@redhat.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=selinux@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).