Linux-XFS Archive mirror
 help / color / mirror / Atom feed
From: "Darrick J. Wong" <djwong@kernel.org>
To: djwong@kernel.org, cem@kernel.org
Cc: Christoph Hellwig <hch@lst.de>,
	cmaiolino@redhat.com, linux-xfs@vger.kernel.org,
	hch@infradead.org
Subject: [PATCH 089/111] libxfs: add xfile support
Date: Mon, 15 Apr 2024 18:00:23 -0700	[thread overview]
Message-ID: <171322883501.211103.5837361141423551988.stgit@frogsfrogsfrogs> (raw)
In-Reply-To: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>

From: Darrick J. Wong <djwong@kernel.org>

Port the xfile functionality (anonymous pageable file-index memory) from
the kernel.  In userspace, we try to use memfd() to create tmpfs files
that are not in any namespace, matching the kernel.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 libxfs/Makefile     |    2 
 libxfs/xfile.c      |  210 +++++++++++++++++++++++++++++++++++++++++++++++++++
 libxfs/xfile.h      |   21 +++++
 repair/xfs_repair.c |   15 ++++
 4 files changed, 248 insertions(+)
 create mode 100644 libxfs/xfile.c
 create mode 100644 libxfs/xfile.h


diff --git a/libxfs/Makefile b/libxfs/Makefile
index 6f688c0ad25a..43e8ae183229 100644
--- a/libxfs/Makefile
+++ b/libxfs/Makefile
@@ -26,6 +26,7 @@ HFILES = \
 	libxfs_priv.h \
 	linux-err.h \
 	topology.h \
+	xfile.h \
 	xfs_ag_resv.h \
 	xfs_alloc.h \
 	xfs_alloc_btree.h \
@@ -66,6 +67,7 @@ CFILES = cache.c \
 	topology.c \
 	trans.c \
 	util.c \
+	xfile.c \
 	xfs_ag.c \
 	xfs_ag_resv.c \
 	xfs_alloc.c \
diff --git a/libxfs/xfile.c b/libxfs/xfile.c
new file mode 100644
index 000000000000..cba173cc17f1
--- /dev/null
+++ b/libxfs/xfile.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "libxfs_priv.h"
+#include "libxfs.h"
+#include "libxfs/xfile.h"
+#include <linux/memfd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+/*
+ * Swappable Temporary Memory
+ * ==========================
+ *
+ * Offline checking sometimes needs to be able to stage a large amount of data
+ * in memory.  This information might not fit in the available memory and it
+ * doesn't all need to be accessible at all times.  In other words, we want an
+ * indexed data buffer to store data that can be paged out.
+ *
+ * memfd files meet those requirements.  Therefore, the xfile mechanism uses
+ * one to store our staging data.  The xfile must be freed with xfile_destroy.
+ *
+ * xfiles assume that the caller will handle all required concurrency
+ * management; file locks are not taken.
+ */
+
+/*
+ * Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that disables
+ * the longstanding memfd behavior that files are created with the executable
+ * bit set, and seals the file against it being turned back on.
+ */
+#ifndef MFD_NOEXEC_SEAL
+# define MFD_NOEXEC_SEAL	(0x0008U)
+#endif
+
+/*
+ * Open a memory-backed fd to back an xfile.  We require close-on-exec here,
+ * because these memfd files function as windowed RAM and hence should never
+ * be shared with other processes.
+ */
+static int
+xfile_create_fd(
+	const char		*description)
+{
+	int			fd = -1;
+	int			ret;
+
+	/*
+	 * memfd_create was added to kernel 3.17 (2014).  MFD_NOEXEC_SEAL
+	 * causes -EINVAL on old kernels, so fall back to omitting it so that
+	 * new xfs_repair can run on an older recovery cd kernel.
+	 */
+	fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
+	if (fd >= 0)
+		goto got_fd;
+	fd = memfd_create(description, MFD_CLOEXEC);
+	if (fd >= 0)
+		goto got_fd;
+
+	/*
+	 * O_TMPFILE exists as of kernel 3.11 (2013), which means that if we
+	 * find it, we're pretty safe in assuming O_CLOEXEC exists too.
+	 */
+	fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
+	if (fd >= 0)
+		goto got_fd;
+
+	fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
+	if (fd >= 0)
+		goto got_fd;
+
+	/*
+	 * mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of
+	 * kernel 2.6.23 (2007).
+	 */
+	fd = mkostemp("libxfsXXXXXX", O_CLOEXEC);
+	if (fd >= 0)
+		goto got_fd;
+
+	if (!errno)
+		errno = EOPNOTSUPP;
+	return -1;
+got_fd:
+	/*
+	 * Turn off mode bits we don't want -- group members and others should
+	 * not have access to the xfile, nor it be executable.  memfds are
+	 * created with mode 0777, but we'll be careful just in case the other
+	 * implementations fail to set 0600.
+	 */
+	ret = fchmod(fd, 0600);
+	if (ret)
+		perror("disabling xfile executable bit");
+
+	return fd;
+}
+
+/*
+ * Create an xfile of the given size.  The description will be used in the
+ * trace output.
+ */
+int
+xfile_create(
+	const char		*description,
+	struct xfile		**xfilep)
+{
+	struct xfile		*xf;
+	int			error;
+
+	xf = kmalloc(sizeof(struct xfile), 0);
+	if (!xf)
+		return -ENOMEM;
+
+	xf->fd = xfile_create_fd(description);
+	if (xf->fd < 0) {
+		error = -errno;
+		kfree(xf);
+		return error;
+	}
+
+	*xfilep = xf;
+	return 0;
+}
+
+/* Close the file and release all resources. */
+void
+xfile_destroy(
+	struct xfile		*xf)
+{
+	close(xf->fd);
+	kfree(xf);
+}
+
+static inline loff_t
+xfile_maxbytes(
+	struct xfile		*xf)
+{
+	if (sizeof(loff_t) == 8)
+		return LLONG_MAX;
+	return LONG_MAX;
+}
+
+/*
+ * Load an object.  Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+ssize_t
+xfile_load(
+	struct xfile		*xf,
+	void			*buf,
+	size_t			count,
+	loff_t			pos)
+{
+	ssize_t			ret;
+
+	if (count > INT_MAX)
+		return -ENOMEM;
+	if (xfile_maxbytes(xf) - pos < count)
+		return -ENOMEM;
+
+	ret = pread(xf->fd, buf, count, pos);
+	if (ret < 0)
+		return -errno;
+	if (ret != count)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Store an object.  Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+ssize_t
+xfile_store(
+	struct xfile		*xf,
+	const void		*buf,
+	size_t			count,
+	loff_t			pos)
+{
+	ssize_t			ret;
+
+	if (count > INT_MAX)
+		return -E2BIG;
+	if (xfile_maxbytes(xf) - pos < count)
+		return -EFBIG;
+
+	ret = pwrite(xf->fd, buf, count, pos);
+	if (ret < 0)
+		return -errno;
+	if (ret != count)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Compute the number of bytes used by a xfile. */
+unsigned long long
+xfile_bytes(
+	struct xfile		*xf)
+{
+	struct stat		statbuf;
+	int			error;
+
+	error = fstat(xf->fd, &statbuf);
+	if (error)
+		return -errno;
+
+	return (unsigned long long)statbuf.st_blocks << 9;
+}
diff --git a/libxfs/xfile.h b/libxfs/xfile.h
new file mode 100644
index 000000000000..d60084011357
--- /dev/null
+++ b/libxfs/xfile.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBXFS_XFILE_H__
+#define __LIBXFS_XFILE_H__
+
+struct xfile {
+	int			fd;
+};
+
+int xfile_create(const char *description, struct xfile **xfilep);
+void xfile_destroy(struct xfile *xf);
+
+ssize_t xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
+ssize_t xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos);
+
+unsigned long long xfile_bytes(struct xfile *xf);
+
+#endif /* __LIBXFS_XFILE_H__ */
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index d4f99f36f71d..01f92e841f29 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -953,6 +953,20 @@ phase_end(
 		platform_crash();
 }
 
+/* Try to allow as many memfds as possible. */
+static void
+bump_max_fds(void)
+{
+	struct rlimit	rlim = { };
+	int		ret;
+
+	ret = getrlimit(RLIMIT_NOFILE, &rlim);
+	if (!ret) {
+		rlim.rlim_cur = rlim.rlim_max;
+		setrlimit(RLIMIT_NOFILE, &rlim);
+	}
+}
+
 int
 main(int argc, char **argv)
 {
@@ -972,6 +986,7 @@ main(int argc, char **argv)
 	bindtextdomain(PACKAGE, LOCALEDIR);
 	textdomain(PACKAGE);
 	dinode_bmbt_translation_init();
+	bump_max_fds();
 
 	temp_mp = &xfs_m;
 	setbuf(stdout, NULL);


  parent reply	other threads:[~2024-04-16  1:00 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-16  0:51 [PATCHBOMB v3] xfsprogs: everything headed towards 6.9 Darrick J. Wong
2024-04-16  0:57 ` [PATCHSET 1/4] xfsprogs: bug fixes for 6.8 Darrick J. Wong
2024-04-16  0:58   ` [PATCH 1/5] xfs_repair: double-check with shortform attr verifiers Darrick J. Wong
2024-04-16  0:59   ` [PATCH 2/5] xfs_db: improve number extraction in getbitval Darrick J. Wong
2024-04-16  4:53     ` Christoph Hellwig
2024-04-16  0:59   ` [PATCH 3/5] xfs_scrub: fix threadcount estimates for phase 6 Darrick J. Wong
2024-04-16  4:53     ` Christoph Hellwig
2024-04-16  0:59   ` [PATCH 4/5] xfs_scrub: don't fail while reporting media scan errors Darrick J. Wong
2024-04-16  0:59   ` [PATCH 5/5] xfs_io: add linux madvise advice codes Darrick J. Wong
2024-04-17  7:34   ` [PATCHSET 1/4] xfsprogs: bug fixes for 6.8 Carlos Maiolino
2024-04-17 15:30     ` Darrick J. Wong
2024-04-16  0:58 ` [PATCHSET 2/4] libxfs: sync with 6.9 Darrick J. Wong
2024-04-16  1:00   ` [PATCH 088/111] libxfs: teach buftargs to maintain their own buffer hashtable Darrick J. Wong
2024-04-16  1:00   ` Darrick J. Wong [this message]
2024-04-16  1:00   ` [PATCH 090/111] libxfs: partition memfd files to avoid using too many fds Darrick J. Wong
2024-04-16  4:55     ` Christoph Hellwig
2024-04-16 15:49       ` Darrick J. Wong
2024-04-16 16:29         ` Christoph Hellwig
2024-04-16 16:57           ` Darrick J. Wong
2024-04-16 18:47             ` Christoph Hellwig
2024-04-16 18:55               ` Darrick J. Wong
2024-04-24 17:20     ` [PATCH v3.1 " Darrick J. Wong
2024-04-16  1:00   ` [PATCH 091/111] xfs: teach buftargs to maintain their own buffer hashtable Darrick J. Wong
2024-04-16  1:01   ` [PATCH 092/111] libxfs: support in-memory buffer cache targets Darrick J. Wong
2024-04-16  0:58 ` [PATCHSET v30.3 3/4] xfsprogs: bmap log intent cleanups Darrick J. Wong
2024-04-16  1:01   ` [PATCH 1/4] libxfs: remove kmem_alloc, kmem_zalloc, and kmem_free Darrick J. Wong
2024-04-16  4:55     ` Christoph Hellwig
2024-04-16  1:01   ` [PATCH 2/4] libxfs: add a bi_entry helper Darrick J. Wong
2024-04-16  4:55     ` Christoph Hellwig
2024-04-16  1:01   ` [PATCH 3/4] libxfs: reuse xfs_bmap_update_cancel_item Darrick J. Wong
2024-04-16  4:55     ` Christoph Hellwig
2024-04-16  1:02   ` [PATCH 4/4] libxfs: add a xattr_entry helper Darrick J. Wong
2024-04-16  4:56     ` Christoph Hellwig
2024-04-16  0:58 ` [PATCHSET v30.3 4/4] xfs_repair: minor fixes Darrick J. Wong
2024-04-16  1:02   ` [PATCH 1/1] xfs_repair: check num before bplist[num] Darrick J. Wong
2024-04-16  4:56     ` Christoph Hellwig
  -- strict thread matches above, loose matches on Subject: below --
2024-05-22  2:45 [PATCHSET v30.4 02/10] libxfs: sync with 6.9 Darrick J. Wong
2024-05-22  3:11 ` [PATCH 089/111] libxfs: add xfile support Darrick J. Wong
2024-06-03 18:49 [PATCHSET v30.5 02/10] libxfs: sync with 6.9 Darrick J. Wong
2024-06-03 19:15 ` [PATCH 089/111] libxfs: add xfile support Darrick J. Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=171322883501.211103.5837361141423551988.stgit@frogsfrogsfrogs \
    --to=djwong@kernel.org \
    --cc=cem@kernel.org \
    --cc=cmaiolino@redhat.com \
    --cc=hch@infradead.org \
    --cc=hch@lst.de \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).