All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
From: Alexey Zhuravlev <bzzz@whamcloud.com>
To: linux-ext4@vger.kernel.org
Subject: merge extent blocks when possible
Date: Sat, 27 Apr 2024 09:35:41 +0300	[thread overview]
Message-ID: <20240427093541.77f14d8b@x390.bzzz77.ru> (raw)


Hi,

Please, consider for inclusion the patch attempting to handle the problem with deep extent tree.

Thanks, Alex

From 0dd791c266e2d00c3217d0f1b836abdbfd4f146f Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev <bzzz@whamcloud.com>
Date: Wed, 19 Jul 2023 21:22:20 +0300
Subject: [PATCH 1/2] In some cases when a lot of extents are created initially
 by sparse file writes, they get merged over time, but there is no way to
 merge blocks in different indexes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For example, if a file is written synchronously, all even blocks first, then odd blocks.
The resulting extents tree looks like the following in "debugfs stat" output, often
with only a single block in each index/leaf:

   EXTENTS:
   (ETB0):33796
   (ETB1):33795
   (0-677):2588672-2589349
   (ETB1):2590753
   (678):2589350
   (ETB1):2590720
   (679-1357):2589351-2590029
   (ETB1):2590752
   (1358):2590030
   (ETB1):2590721
   (1359-2037):2590031-2590709
   (ETB1):2590751
   (2038):2590710
   (ETB1):2590722
   :
   :

With the patch applied the index and lead blocks are properly merged (0.6% slower
under this random sync write workload, but later read IOPS are greatly reduced):

   EXTENTS:
   (ETB0):33796
   (ETB1):2590736
   (0-2047):2588672-2590719
   (2048-11999):2592768-2602719

Originally the problem was hit with a real application operating on huge datasets and with just
27371 extents "inode has invalid extent depth: 6” problem occurred.
With the patch applied the application succeeded having finally 73637 in 3-level tree.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
---
 fs/ext4/extents.c     | 185 ++++++++++++++++++++++++++++++++++++++++--
 fs/jbd2/transaction.c |   1 +
 2 files changed, 180 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e57054bdc5fd..a2de6b863df1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1885,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
  * This function tries to merge the @ex extent to neighbours in the tree, then
  * tries to collapse the extent tree into the inode.
  */
-static void ext4_ext_try_to_merge(handle_t *handle,
+static int ext4_ext_try_to_merge(handle_t *handle,
 				  struct inode *inode,
 				  struct ext4_ext_path *path,
 				  struct ext4_extent *ex)
@@ -1902,9 +1902,178 @@ static void ext4_ext_try_to_merge(handle_t *handle,
 		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
 
 	if (!merge_done)
-		(void) ext4_ext_try_to_merge_right(inode, path, ex);
+		merge_done = ext4_ext_try_to_merge_right(inode, path, ex);
 
 	ext4_ext_try_to_merge_up(handle, inode, path);
+
+	return merge_done;
+}
+
+/*
+ * This function tries to merge blocks from @path into @npath
+ */
+static int ext4_ext_merge_blocks(handle_t *handle,
+				struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_ext_path *npath)
+{
+	unsigned int depth = ext_depth(inode);
+	int used, nused, free, i, k, err;
+	ext4_fsblk_t next;
+
+	if (path[depth].p_hdr == npath[depth].p_hdr)
+		return 0;
+
+	used = le16_to_cpu(path[depth].p_hdr->eh_entries);
+	free = le16_to_cpu(npath[depth].p_hdr->eh_max) -
+		le16_to_cpu(npath[depth].p_hdr->eh_entries);
+	if (free < used)
+		return 0;
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		return err;
+	err = ext4_ext_get_access(handle, inode, npath + depth);
+	if (err)
+		return err;
+
+	/* move entries from the current leave to the next one */
+	nused = le16_to_cpu(npath[depth].p_hdr->eh_entries);
+	memmove(EXT_FIRST_EXTENT(npath[depth].p_hdr) + used,
+		EXT_FIRST_EXTENT(npath[depth].p_hdr),
+		nused * sizeof(struct ext4_extent));
+	memcpy(EXT_FIRST_EXTENT(npath[depth].p_hdr),
+		EXT_FIRST_EXTENT(path[depth].p_hdr),
+		used * sizeof(struct ext4_extent));
+	le16_add_cpu(&npath[depth].p_hdr->eh_entries, used);
+	le16_add_cpu(&path[depth].p_hdr->eh_entries, -used);
+	ext4_ext_try_to_merge_right(inode, npath,
+					EXT_FIRST_EXTENT(npath[depth].p_hdr));
+
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	if (err)
+		return err;
+	err = ext4_ext_dirty(handle, inode, npath + depth);
+	if (err)
+		return err;
+
+	/* otherwise the index won't get corrected */
+	npath[depth].p_ext = EXT_FIRST_EXTENT(npath[depth].p_hdr);
+	err = ext4_ext_correct_indexes(handle, inode, npath);
+	if (err)
+		return err;
+
+	for (i = depth - 1; i >= 0; i--) {
+
+		next = ext4_idx_pblock(path[i].p_idx);
+		ext4_free_blocks(handle, inode, NULL, next, 1,
+				EXT4_FREE_BLOCKS_METADATA |
+				EXT4_FREE_BLOCKS_FORGET);
+		err = ext4_ext_get_access(handle, inode, path + i);
+		if (err)
+			return err;
+		le16_add_cpu(&path[i].p_hdr->eh_entries, -1);
+		if (le16_to_cpu(path[i].p_hdr->eh_entries) == 0) {
+			/* whole index block collapsed, go up */
+			continue;
+		}
+		/* remove index pointer */
+		used = EXT_LAST_INDEX(path[i].p_hdr) - path[i].p_idx + 1;
+		memmove(path[i].p_idx, path[i].p_idx + 1,
+			used * sizeof(struct ext4_extent_idx));
+
+		err = ext4_ext_dirty(handle, inode, path + i);
+		if (err)
+			return err;
+
+		if (path[i].p_hdr == npath[i].p_hdr)
+			break;
+
+		/* try to move index pointers */
+		used = le16_to_cpu(path[i].p_hdr->eh_entries);
+		free = le16_to_cpu(npath[i].p_hdr->eh_max) -
+			le16_to_cpu(npath[i].p_hdr->eh_entries);
+		if (used > free)
+			break;
+		err = ext4_ext_get_access(handle, inode, npath + i);
+		if (err)
+			return err;
+		memmove(EXT_FIRST_INDEX(npath[i].p_hdr) + used,
+			EXT_FIRST_INDEX(npath[i].p_hdr),
+			npath[i].p_hdr->eh_entries * sizeof(struct ext4_extent_idx));
+		memcpy(EXT_FIRST_INDEX(npath[i].p_hdr), EXT_FIRST_INDEX(path[i].p_hdr),
+			used * sizeof(struct ext4_extent_idx));
+		le16_add_cpu(&path[i].p_hdr->eh_entries, -used);
+		le16_add_cpu(&npath[i].p_hdr->eh_entries, used);
+		err = ext4_ext_dirty(handle, inode, path + i);
+		if (err)
+			return err;
+		err = ext4_ext_dirty(handle, inode, npath + i);
+		if (err)
+			return err;
+
+		/* correct index above */
+		for (k = i; k > 0; k--) {
+			err = ext4_ext_get_access(handle, inode, npath + k - 1);
+			if (err)
+				return err;
+			npath[k-1].p_idx->ei_block =
+				EXT_FIRST_INDEX(npath[k].p_hdr)->ei_block;
+			err = ext4_ext_dirty(handle, inode, npath + k - 1);
+			if (err)
+				return err;
+		}
+	}
+
+	/*
+	 * TODO: given we've got two paths, it should be possible to
+	 * collapse those two blocks into the root one in some cases
+	 */
+	return 1;
+}
+
+static int ext4_ext_try_to_merge_blocks(handle_t *handle,
+		struct inode *inode,
+		struct ext4_ext_path *path)
+{
+	struct ext4_ext_path *npath = NULL;
+	unsigned int depth = ext_depth(inode);
+	ext4_lblk_t next;
+	int used, rc = 0;
+
+	if (depth == 0)
+		return 0;
+
+	used = le16_to_cpu(path[depth].p_hdr->eh_entries);
+	/* don't be too agressive as checking space in
+	 * the next block is not free */
+	if (used > ext4_ext_space_block(inode, 0) / 4)
+		return 0;
+
+	/* try to merge to the next block */
+	next = ext4_ext_next_leaf_block(path);
+	if (next == EXT_MAX_BLOCKS)
+		return 0;
+	npath = ext4_find_extent(inode, next, NULL, 0);
+	if (IS_ERR(npath))
+		return 0;
+	rc = ext4_ext_merge_blocks(handle, inode, path, npath);
+	ext4_ext_drop_refs(npath);
+	kfree(npath);
+	if (rc)
+		return rc > 0 ? 0 : rc;
+
+	/* try to merge with the previous block */
+	if (EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block == 0)
+		return 0;
+	next = EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block - 1;
+	npath = ext4_find_extent(inode, next, NULL, 0);
+	if (IS_ERR(npath))
+		return 0;
+	rc = ext4_ext_merge_blocks(handle, inode, npath, path);
+	ext4_ext_drop_refs(npath);
+	kfree(npath);
+	return rc > 0 ? 0 : rc;
 }
 
 /*
@@ -1976,6 +2145,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	int depth, len, err;
 	ext4_lblk_t next;
 	int mb_flags = 0, unwritten;
+	int merged = 0;
 
 	if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
@@ -2167,8 +2337,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 merge:
 	/* try to merge extents */
 	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
-		ext4_ext_try_to_merge(handle, inode, path, nearex);
-
+		merged = ext4_ext_try_to_merge(handle, inode, path, nearex);
 
 	/* time to correct all indexes above */
 	err = ext4_ext_correct_indexes(handle, inode, path);
@@ -2176,6 +2345,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 		goto cleanup;
 
 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+	if (!err && merged)
+		err = ext4_ext_try_to_merge_blocks(handle, inode, path);
 
 cleanup:
 	ext4_free_ext_path(npath);
@@ -3741,7 +3912,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 	/* note: ext4_ext_correct_indexes() isn't needed here because
 	 * borders are not changed
 	 */
-	ext4_ext_try_to_merge(handle, inode, path, ex);
+	if (ext4_ext_try_to_merge(handle, inode, path, ex))
+		ext4_ext_try_to_merge_blocks(handle, inode, path);
 
 	/* Mark modified extent as dirty */
 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3804,7 +3976,8 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
 	/* note: ext4_ext_correct_indexes() isn't needed here because
 	 * borders are not changed
 	 */
-	ext4_ext_try_to_merge(handle, inode, path, ex);
+	if (ext4_ext_try_to_merge(handle, inode, path, ex))
+		ext4_ext_try_to_merge_blocks(handle, inode, path);
 
 	/* Mark modified extent as dirty */
 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cb0b8d6fc0c6..4cd738fa408e 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -513,6 +513,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
 		}
 		rsv_handle->h_reserved = 1;
 		rsv_handle->h_journal = journal;
+		rsv_handle->h_revoke_credits = revoke_records;
 		handle->h_rsv_handle = rsv_handle;
 	}
 	handle->h_revoke_credits = revoke_records;
-- 
2.44.0


                 reply	other threads:[~2024-04-27  6:36 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240427093541.77f14d8b@x390.bzzz77.ru \
    --to=bzzz@whamcloud.com \
    --cc=linux-ext4@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.