LKML Archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
@ 2024-04-22 10:05 Yang Yang
  2024-04-25  2:54 ` YangYang
  0 siblings, 1 reply; 2+ messages in thread
From: Yang Yang @ 2024-04-22 10:05 UTC (permalink / raw)
  To: Alasdair Kergon, Mike Snitzer, Mikulas Patocka, dm-devel,
	linux-kernel
  Cc: Yang Yang

__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if num_targets exceeds the number of block
devices in the dm_table's device list, this can lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets significantly
exceeds the number of block devices, such behavior may result in a
noticeable decrease in performance.

This issue can be reproduced using this command line:
  for i in {0..1023}; do
    echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
  done | dmsetup create example

With this fix, a random-write workload with fdatasync, run with the
following fio command:

  fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
      --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
      --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1

results in write throughput increasing from 857 KB/s to 30.8 MB/s
(a 3580% increase).

Signed-off-by: Yang Yang <yang.yang@vivo.com>
---
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm-table.c         |  7 +++++
 drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h |  6 ++++
 4 files changed, 73 insertions(+)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..7e3f2168289f 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -217,6 +217,7 @@ struct dm_table {
 	/* a list of devices used by this table */
 	struct list_head devices;
 	struct rw_semaphore devices_lock;
+	unsigned short num_devices;
 
 	/* events get handed up using this callback */
 	void (*event_fn)(void *data);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..ddc60e498afb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
 
 int dm_table_resume_targets(struct dm_table *t)
 {
+	struct list_head *devices = dm_table_get_devices(t);
+	struct dm_dev_internal *dd;
 	unsigned int i;
 	int r = 0;
 
@@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
 			ti->type->resume(ti);
 	}
 
+	t->num_devices = 0;
+
+	list_for_each_entry(dd, devices, list)
+		dd->dm_dev->index = ++(t->num_devices);
+
 	return 0;
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..7297235291f6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -48,6 +48,8 @@
  */
 #define REQ_DM_POLL_LIST	REQ_DRV
 
+#define DM_MAX_TABLE_DEVICES	1024
+
 static const char *_name = DM_NAME;
 
 static unsigned int major;
@@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
 	return ret;
 }
 
+static inline bool has_redundant_flush(struct dm_table *t,
+		unsigned long **bitmap)
+{
+	if (t->num_devices < t->num_targets) {
+		/* Add a limit here to prevent excessive memory usage for bitmaps */
+		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
+			return false;
+
+		/* dm_dev's index starts from 1, so need plus 1 here */
+		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
+		if (*bitmap)
+			return true;
+	}
+
+	return false;
+}
+
+static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
+				     sector_t start, sector_t len, void *data)
+{
+	unsigned short *index = data;
+	*index = dev->index;
+	return 0;
+}
+
 static void __send_empty_flush(struct clone_info *ci)
 {
 	struct dm_table *t = ci->map;
 	struct bio flush_bio;
+	unsigned long *handled_map;
+	unsigned int nr_handled = 0;
+	bool check = has_redundant_flush(t, &handled_map);
 
 	/*
 	 * Use an on-stack bio for this, it's safe since we don't
@@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
 
 	for (unsigned int i = 0; i < t->num_targets; i++) {
 		unsigned int bios;
+		unsigned short index = 0;
 		struct dm_target *ti = dm_table_get_target(t, i);
 
 		if (unlikely(ti->num_flush_bios == 0))
 			continue;
 
+		/*
+		 * If the num_targets is greater than the number of block devices
+		 * in the dm_table's devices list, __send_empty_flush() might
+		 * invoke __send_duplicate_bios() multiple times for the same
+		 * block device. This could lead to a substantial decrease in
+		 * performance when num_targets significantly exceeds the number
+		 * of block devices.
+		 * Ensure that __send_duplicate_bios() is only called once for
+		 * each block device.
+		 */
+		if (check) {
+			if (nr_handled == t->num_devices)
+				break;
+
+			if (ti->type->iterate_devices)
+				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
+
+			if (index > 0) {
+				if (__test_and_set_bit(index, handled_map))
+					continue;
+				else
+					nr_handled++;
+			}
+		}
+
 		atomic_add(ti->num_flush_bios, &ci->io->io_count);
 		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
 					     NULL, GFP_NOWAIT);
 		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
 	}
 
+	if (check)
+		bitmap_free(handled_map);
+
 	/*
 	 * alloc_io() takes one extra reference for submission, so the
 	 * reference won't reach 0 without the following subtraction
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 82b2195efaca..4a54b4f0a609 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -169,6 +169,12 @@ struct dm_dev {
 	struct dax_device *dax_dev;
 	blk_mode_t mode;
 	char name[16];
+
+	/*
+	 * sequential number for each dm_dev in dm_table's devices list,
+	 * start from 1
+	 */
+	unsigned short index;
 };
 
 /*
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device
  2024-04-22 10:05 [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device Yang Yang
@ 2024-04-25  2:54 ` YangYang
  0 siblings, 0 replies; 2+ messages in thread
From: YangYang @ 2024-04-25  2:54 UTC (permalink / raw)
  To: Alasdair Kergon, Mike Snitzer, Mikulas Patocka; +Cc: linux-kernel, dm-devel

On 2024/4/22 18:05, Yang Yang wrote:
> __send_empty_flush() sends empty flush bios to every target in the
> dm_table. However, if the num_targets exceeds the number of block
> devices in the dm_table's device list, it could lead to multiple
> invocations of __send_duplicate_bios() for the same block device.
> Typically, a single thread sending numerous empty flush bios to one
> block device is redundant, as these bios are likely to be merged by the
> flush state machine. In scenarios where num_targets significantly
> outweighs the number of block devices, such behavior may result in a
> noteworthy decrease in performance.
> 
> This issue can be reproduced using this command line:
>    for i in {0..1023}; do
>      echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
>    done | dmsetup create example
> 
> With this fix, a random write with fsync workload executed with the
> following fio command:
> 
>    fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
>        --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
>        --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1
> 
> results in an increase from 857 KB/s to 30.8 MB/s of the write
> throughput (3580% increase).
> 
> Signed-off-by: Yang Yang <yang.yang@vivo.com>
> ---
>   drivers/md/dm-core.h          |  1 +
>   drivers/md/dm-table.c         |  7 +++++
>   drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
>   include/linux/device-mapper.h |  6 ++++
>   4 files changed, 73 insertions(+)
> 
> diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
> index e6757a30dcca..7e3f2168289f 100644
> --- a/drivers/md/dm-core.h
> +++ b/drivers/md/dm-core.h
> @@ -217,6 +217,7 @@ struct dm_table {
>   	/* a list of devices used by this table */
>   	struct list_head devices;
>   	struct rw_semaphore devices_lock;
> +	unsigned short num_devices;
>   
>   	/* events get handed up using this callback */
>   	void (*event_fn)(void *data);
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index 41f1d731ae5a..ddc60e498afb 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
>   
>   int dm_table_resume_targets(struct dm_table *t)
>   {
> +	struct list_head *devices = dm_table_get_devices(t);
> +	struct dm_dev_internal *dd;
>   	unsigned int i;
>   	int r = 0;
>   
> @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
>   			ti->type->resume(ti);
>   	}
>   
> +	t->num_devices = 0;
> +
> +	list_for_each_entry(dd, devices, list)
> +		dd->dm_dev->index = ++(t->num_devices);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 56aa2a8b9d71..7297235291f6 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -48,6 +48,8 @@
>    */
>   #define REQ_DM_POLL_LIST	REQ_DRV
>   
> +#define DM_MAX_TABLE_DEVICES	1024
> +
>   static const char *_name = DM_NAME;
>   
>   static unsigned int major;
> @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
>   	return ret;
>   }
>   
> +static inline bool has_redundant_flush(struct dm_table *t,
> +		unsigned long **bitmap)
> +{
> +	if (t->num_devices < t->num_targets) {
> +		/* Add a limit here to prevent excessive memory usage for bitmaps */
> +		if (t->num_devices >= DM_MAX_TABLE_DEVICES)
> +			return false;
> +
> +		/* dm_dev's index starts from 1, so need plus 1 here */
> +		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
> +		if (*bitmap)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
> +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
> +				     sector_t start, sector_t len, void *data)
> +{
> +	unsigned short *index = data;
> +	*index = dev->index;
> +	return 0;
> +}
> +
>   static void __send_empty_flush(struct clone_info *ci)
>   {
>   	struct dm_table *t = ci->map;
>   	struct bio flush_bio;
> +	unsigned long *handled_map;
> +	unsigned int nr_handled = 0;
> +	bool check = has_redundant_flush(t, &handled_map);
>   
>   	/*
>   	 * Use an on-stack bio for this, it's safe since we don't
> @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
>   
>   	for (unsigned int i = 0; i < t->num_targets; i++) {
>   		unsigned int bios;
> +		unsigned short index = 0;
>   		struct dm_target *ti = dm_table_get_target(t, i);
>   
>   		if (unlikely(ti->num_flush_bios == 0))
>   			continue;
>   
> +		/*
> +		 * If the num_targets is greater than the number of block devices
> +		 * in the dm_table's devices list, __send_empty_flush() might
> +		 * invoke __send_duplicate_bios() multiple times for the same
> +		 * block device. This could lead to a substantial decrease in
> +		 * performance when num_targets significantly exceeds the number
> +		 * of block devices.
> +		 * Ensure that __send_duplicate_bios() is only called once for
> +		 * each block device.
> +		 */
> +		if (check) {
> +			if (nr_handled == t->num_devices)
> +				break;
> +
> +			if (ti->type->iterate_devices)
> +				ti->type->iterate_devices(ti, dm_get_dev_index, &index);
> +
> +			if (index > 0) {
> +				if (__test_and_set_bit(index, handled_map))
> +					continue;
> +				else
> +					nr_handled++;
> +			}
> +		}
> +
>   		atomic_add(ti->num_flush_bios, &ci->io->io_count);
>   		bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
>   					     NULL, GFP_NOWAIT);
>   		atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
>   	}
>   
> +	if (check)
> +		bitmap_free(handled_map);
> +
>   	/*
>   	 * alloc_io() takes one extra reference for submission, so the
>   	 * reference won't reach 0 without the following subtraction
> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
> index 82b2195efaca..4a54b4f0a609 100644
> --- a/include/linux/device-mapper.h
> +++ b/include/linux/device-mapper.h
> @@ -169,6 +169,12 @@ struct dm_dev {
>   	struct dax_device *dax_dev;
>   	blk_mode_t mode;
>   	char name[16];
> +
> +	/*
> +	 * sequential number for each dm_dev in dm_table's devices list,
> +	 * start from 1
> +	 */
> +	unsigned short index;
>   };
>   
>   /*

Hello experts,
Any comments are welcome!

Thanks

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2024-04-25  2:55 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-22 10:05 [RFC PATCH] dm: Avoid sending redundant empty flush bios to the same block device Yang Yang
2024-04-25  2:54 ` YangYang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).