From: Guillaume Morin <guillaume@morinfr.org>
To: David Hildenbrand <david@redhat.com>
Cc: Guillaume Morin <guillaume@morinfr.org>,
	oleg@redhat.com, linux-kernel@vger.kernel.org,
	linux-trace-kernel@vger.kernel.org, muchun.song@linux.dev
Subject: Re: [RFC][PATCH] uprobe: support for private hugetlb mappings
Date: Thu, 2 May 2024 05:59:50 +0200
Message-ID: <ZjMPtrsrUi8-QJ0G@bender.morinfr.org>
In-Reply-To: <8a7b9e65-b073-4132-9680-efc2b3af6af0@redhat.com>

On 30 Apr 21:25, David Hildenbrand wrote:
> > I tried to get the hugepd stuff right but this was the first I heard
> > about it :-) Afaict follow_huge_pmd and friends were already DTRT
> 
> I'll have to have a closer look at some details (the hugepd writability
> check looks a bit odd), but it's mostly what I would have expected!

Ok, in the meantime, here is the uprobe change on top of your current
uprobes_cow, trying to address the comments you made in your previous
message. Some of them were not 100% clear to me, so it's a best-effort
patch :-) Again, only lightly tested.
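
For reference, the kind of mapping this is aimed at is roughly the
sketch below (purely illustrative: the hugetlbfs mount point and the 2M
size are assumptions about the setup, not anything taken from the
patch):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/*
	 * An executable, MAP_PRIVATE hugetlb mapping, which valid_vma()
	 * rejected so far because of VM_HUGETLB.
	 */
	int fd = open("/mnt/huge/text", O_RDONLY);
	void *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	p = mmap(NULL, 2UL * 1024 * 1024, PROT_READ | PROT_EXEC,
		 MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("private hugetlb text mapped at %p\n", p);
	return 0;
}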

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,11 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 	{}
 };
 
+bool hugetlbfs_mapping(struct address_space *mapping)
+{
+	return mapping->a_ops == &hugetlbfs_aops;
+}
+
 /*
  * Mask used when checking the page offset value passed in via system
  * calls.  This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,6 +511,8 @@ struct hugetlbfs_sb_info {
 	umode_t mode;
 };
 
+bool hugetlbfs_mapping(struct address_space *mapping);
+
 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -557,6 +559,8 @@ static inline struct hstate *hstate_inode(struct inode *i)
 {
 	return NULL;
 }
+
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
 #endif /* !CONFIG_HUGETLBFS */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <linux/pagemap.h>	/* read_mapping_page */
 #include <linux/slab.h>
 #include <linux/sched.h>
@@ -120,7 +121,7 @@ struct xol_area {
  */
 static bool valid_vma(struct vm_area_struct *vma, bool is_register)
 {
-	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+	vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;
 
 	if (is_register)
 		flags |= VM_WRITE;
@@ -177,6 +178,19 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
 	kunmap_atomic(kaddr);
 }
 
+static bool compare_pages(struct page *page1, struct page *page2, unsigned long page_size)
+{
+	char *addr1, *addr2;
+	int ret;
+
+	addr1 = kmap_local_page(page1);
+	addr2 = kmap_local_page(page2);
+	ret = memcmp(addr1, addr2, page_size);
+	kunmap_local(addr2);
+	kunmap_local(addr1);
+	return ret == 0;
+}
+
 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
 {
 	uprobe_opcode_t old_opcode;
@@ -366,7 +380,9 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
 }
 
 static bool orig_page_is_identical(struct vm_area_struct *vma,
-		unsigned long vaddr, struct page *page, bool *large)
+		unsigned long vaddr, struct page *page,
+		unsigned long page_size,
+		bool *large)
 {
 	const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
 	struct page *orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
@@ -380,7 +396,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
 
 	*large = folio_test_large(orig_folio);
 	identical = folio_test_uptodate(orig_folio) &&
-		    pages_identical(page, orig_page);
+		    compare_pages(page, orig_page, page_size);
 	folio_put(orig_folio);
 	return identical;
 }
@@ -396,6 +412,81 @@ struct uwo_data {
 	uprobe_opcode_t opcode;
 };
 
+static int __write_opcode_hugetlb(pte_t *ptep, unsigned long page_mask,
+				  unsigned long vaddr,
+				  unsigned long next, struct mm_walk *walk)
+{
+	struct uwo_data *data = walk->private;
+	const bool is_register = !!is_swbp_insn(&data->opcode);
+	pte_t pte = huge_ptep_get(ptep);
+	struct folio *folio;
+	struct page *page;
+	bool large;
+	struct hstate *h = hstate_vma(walk->vma);
+	unsigned int subpage_index = (vaddr & (huge_page_size(h) - 1)) >>
+		PAGE_SHIFT;
+
+	if (!pte_present(pte))
+		return UWO_RETRY;
+	page = vm_normal_page(walk->vma, vaddr, pte);
+	if (!page)
+		return UWO_RETRY;
+	folio = page_folio(page);
+
+	/* When unregistering and there is no anon folio anymore, we're done. */
+	if (!folio_test_anon(folio))
+		return is_register ? UWO_RETRY_WRITE_FAULT : UWO_DONE;
+
+	/*
+	 * See can_follow_write_pte(): we'd actually prefer requiring a
+	 * writable PTE here, but when unregistering we might no longer have
+	 * VM_WRITE ...
+	 */
+	if (!huge_pte_write(pte)) {
+		if (!PageAnonExclusive(page))
+			return UWO_RETRY_WRITE_FAULT;
+		if (unlikely(userfaultfd_wp(walk->vma) && huge_pte_uffd_wp(pte)))
+			return UWO_RETRY_WRITE_FAULT;
+		/* SOFTDIRTY is handled via pte_mkdirty() below. */
+	}
+
+	/* Unmap + flush the TLB, such that we can write atomically. */
+	flush_cache_page(walk->vma, vaddr & page_mask, pte_pfn(pte));
+	pte = huge_ptep_clear_flush(walk->vma, vaddr & page_mask, ptep);
+	copy_to_page(nth_page(page, subpage_index), data->opcode_vaddr,
+		     &data->opcode, UPROBE_SWBP_INSN_SIZE);
+
+	/*
+	 * When unregistering, we may only zap a PTE if uffd is disabled and
+	 * the folio is not pinned ...
+	 */
+	if (is_register || userfaultfd_missing(walk->vma) ||
+	    folio_maybe_dma_pinned(folio))
+		goto remap;
+
+	/*
+	 * ... the mapped anon page is identical to the original page (that
+	 * will get faulted in on next access), and we don't have GUP pins.
+	 */
+	if (!orig_page_is_identical(walk->vma, vaddr & page_mask, page,
+				    huge_page_size(h), &large))
+		goto remap;
+
+	hugetlb_remove_rmap(folio);
+	folio_put(folio);
+	return UWO_DONE;
+remap:
+	/*
+	 * Make sure that our copy_to_page() changes become visible before the
+	 * set_huge_pte_at() write.
+	 */
+	smp_wmb();
+	/* We modified the page. Make sure to mark the PTE dirty. */
+	set_huge_pte_at(walk->mm, vaddr & page_mask, ptep,
+			huge_pte_mkdirty(pte), huge_page_size(h));
+	return UWO_DONE;
+}
+
 static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
 		unsigned long next, struct mm_walk *walk)
 {
@@ -447,7 +538,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
 	 * ... the mapped anon page is identical to the original page (that
 	 * will get faulted in on next access), and we don't have GUP pins.
 	 */
-	if (!orig_page_is_identical(walk->vma, vaddr, page, &large))
+	if (!orig_page_is_identical(walk->vma, vaddr, page, PAGE_SIZE, &large))
 		goto remap;
 
 	/* Zap it and try to reclaim swap space. */
@@ -473,6 +564,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
 }
 
 static const struct mm_walk_ops write_opcode_ops = {
+	.hugetlb_entry		= __write_opcode_hugetlb,
 	.pte_entry		= __write_opcode_pte,
 	.walk_lock		= PGWALK_WRLOCK,
 };
@@ -510,6 +602,8 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	struct mmu_notifier_range range;
 	int ret, ref_ctr_updated = 0;
 	struct page *page;
+	unsigned long page_size = PAGE_SIZE;
+	unsigned long page_mask = PAGE_MASK;
 
 	if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
 		return -EINVAL;
@@ -528,6 +622,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	if (ret != 1)
 		goto out;
 
+	if (is_vm_hugetlb_page(vma)) {
+		struct hstate *h = hstate_vma(vma);
+		page_size = huge_page_size(h);
+		page_mask = huge_page_mask(h);
+	}
 	ret = verify_opcode(page, opcode_vaddr, &opcode);
 	put_page(page);
 	if (ret <= 0)
@@ -547,8 +646,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		 * unregistering. So trigger MMU notifiers now, as we won't
 		 * be able to do it under PTL.
 		 */
+		const unsigned long start = vaddr & page_mask;
 		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
-					vaddr, vaddr + PAGE_SIZE);
+					start, start + page_size);
 		mmu_notifier_invalidate_range_start(&range);
 	}
 
@@ -830,8 +930,16 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
 	 */
 	if (mapping->a_ops->read_folio)
 		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
-	else
+	else if (!is_file_hugepages(filp))
 		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+	else {
+		struct hstate *h = hstate_file(filp);
+		unsigned long mask = huge_page_mask(h);
+		page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		page = nth_page(page, (offset & (huge_page_size(h) - 1)) >> PAGE_SHIFT);
+	}
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
@@ -1182,9 +1290,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
 	if (!uc->handler && !uc->ret_handler)
 		return -EINVAL;
 
-	/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+	/* copy_insn() uses read_mapping_page() or shmem/hugetlbfs-specific
+	 * logic.
+	 */
 	if (!inode->i_mapping->a_ops->read_folio &&
-	    !shmem_mapping(inode->i_mapping))
+	    !shmem_mapping(inode->i_mapping) &&
+	    !hugetlbfs_mapping(inode->i_mapping))
 		return -EIO;
 	/* Racy, just to catch the obvious mistakes */
 	if (offset > i_size_read(inode))

-- 
Guillaume Morin <guillaume@morinfr.org>
