LKML Archive mirror
 help / color / mirror / Atom feed
From: Yan Zhao <yan.y.zhao@intel.com>
To: Alex Williamson <alex.williamson@redhat.com>
Cc: <kvm@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<x86@kernel.org>, <jgg@nvidia.com>, <kevin.tian@intel.com>,
	<iommu@lists.linux.dev>, <pbonzini@redhat.com>,
	<seanjc@google.com>, <dave.hansen@linux.intel.com>,
	<luto@kernel.org>, <peterz@infradead.org>, <tglx@linutronix.de>,
	<mingo@redhat.com>, <bp@alien8.de>, <hpa@zytor.com>,
	<corbet@lwn.net>, <joro@8bytes.org>, <will@kernel.org>,
	<robin.murphy@arm.com>, <baolu.lu@linux.intel.com>,
	<yi.l.liu@intel.com>
Subject: Re: [PATCH 4/5] vfio/type1: Flush CPU caches on DMA pages in non-coherent domains
Date: Mon, 13 May 2024 15:11:28 +0800	[thread overview]
Message-ID: <ZkG9IEQwi7HG3YBk@yzhao56-desk.sh.intel.com> (raw)
In-Reply-To: <20240510105728.76d97bbb.alex.williamson@redhat.com>

On Fri, May 10, 2024 at 10:57:28AM -0600, Alex Williamson wrote:
> On Fri, 10 May 2024 18:31:13 +0800
> Yan Zhao <yan.y.zhao@intel.com> wrote:
> 
> > On Thu, May 09, 2024 at 12:10:49PM -0600, Alex Williamson wrote:
> > > On Tue,  7 May 2024 14:21:38 +0800
> > > Yan Zhao <yan.y.zhao@intel.com> wrote:  
> > ... 
> > > >  drivers/vfio/vfio_iommu_type1.c | 51 +++++++++++++++++++++++++++++++++
> > > >  1 file changed, 51 insertions(+)
> > > > 
> > > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > > > index b5c15fe8f9fc..ce873f4220bf 100644
> > > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > > @@ -74,6 +74,7 @@ struct vfio_iommu {
> > > >  	bool			v2;
> > > >  	bool			nesting;
> > > >  	bool			dirty_page_tracking;
> > > > +	bool			has_noncoherent_domain;
> > > >  	struct list_head	emulated_iommu_groups;
> > > >  };
> > > >  
> > > > @@ -99,6 +100,7 @@ struct vfio_dma {
> > > >  	unsigned long		*bitmap;
> > > >  	struct mm_struct	*mm;
> > > >  	size_t			locked_vm;
> > > > +	bool			cache_flush_required; /* For noncoherent domain */  
> > > 
> > > Poor packing, minimally this should be grouped with the other bools in
> > > the structure, longer term they should likely all be converted to
> > > bit fields.  
> > Yes. Will do!
> > 
> > >   
> > > >  };
> > > >  
> > > >  struct vfio_batch {
> > > > @@ -716,6 +718,9 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> > > >  	long unlocked = 0, locked = 0;
> > > >  	long i;
> > > >  
> > > > +	if (dma->cache_flush_required)
> > > > +		arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT, npage << PAGE_SHIFT);
> > > > +
> > > >  	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > > >  		if (put_pfn(pfn++, dma->prot)) {
> > > >  			unlocked++;
> > > > @@ -1099,6 +1104,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > > >  					    &iotlb_gather);
> > > >  	}
> > > >  
> > > > +	dma->cache_flush_required = false;
> > > > +
> > > >  	if (do_accounting) {
> > > >  		vfio_lock_acct(dma, -unlocked, true);
> > > >  		return 0;
> > > > @@ -1120,6 +1127,21 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > > >  	iommu->dma_avail++;
> > > >  }
> > > >  
> > > > +static void vfio_update_noncoherent_domain_state(struct vfio_iommu *iommu)
> > > > +{
> > > > +	struct vfio_domain *domain;
> > > > +	bool has_noncoherent = false;
> > > > +
> > > > +	list_for_each_entry(domain, &iommu->domain_list, next) {
> > > > +		if (domain->enforce_cache_coherency)
> > > > +			continue;
> > > > +
> > > > +		has_noncoherent = true;
> > > > +		break;
> > > > +	}
> > > > +	iommu->has_noncoherent_domain = has_noncoherent;
> > > > +}  
> > > 
> > > This should be merged with vfio_domains_have_enforce_cache_coherency()
> > > and the VFIO_DMA_CC_IOMMU extension (if we keep it, see below).  
> > Will convert it to a counter and do the merge.
> > Thanks for pointing it out!
> > 
> > >   
> > > > +
> > > >  static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
> > > >  {
> > > >  	struct vfio_domain *domain;
> > > > @@ -1455,6 +1477,12 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > > >  
> > > >  	vfio_batch_init(&batch);
> > > >  
> > > > +	/*
> > > > +	 * Record necessity to flush CPU cache to make sure CPU cache is flushed
> > > > +	 * for both pin & map and unmap & unpin (for unwind) paths.
> > > > +	 */
> > > > +	dma->cache_flush_required = iommu->has_noncoherent_domain;
> > > > +
> > > >  	while (size) {
> > > >  		/* Pin a contiguous chunk of memory */
> > > >  		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
> > > > @@ -1466,6 +1494,10 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > > >  			break;
> > > >  		}
> > > >  
> > > > +		if (dma->cache_flush_required)
> > > > +			arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT,
> > > > +						npage << PAGE_SHIFT);
> > > > +
> > > >  		/* Map it! */
> > > >  		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
> > > >  				     dma->prot);
> > > > @@ -1683,9 +1715,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> > > >  	for (; n; n = rb_next(n)) {
> > > >  		struct vfio_dma *dma;
> > > >  		dma_addr_t iova;
> > > > +		bool cache_flush_required;
> > > >  
> > > >  		dma = rb_entry(n, struct vfio_dma, node);
> > > >  		iova = dma->iova;
> > > > +		cache_flush_required = !domain->enforce_cache_coherency &&
> > > > +				       !dma->cache_flush_required;
> > > > +		if (cache_flush_required)
> > > > +			dma->cache_flush_required = true;  
> > > 
> > > The variable name here isn't accurate and the logic is confusing.  If
> > > the domain does not enforce coherency and the mapping is not tagged as
> > > requiring a cache flush, then we need to mark the mapping as requiring
> > > a cache flush.  So the variable state is something more akin to
> > > set_cache_flush_required.  But all we're saving with this is a
> > > redundant set if the mapping is already tagged as requiring a cache
> > > flush, so it could really be simplified to:
> > > 
> > > 		dma->cache_flush_required = !domain->enforce_cache_coherency;  
> > Sorry about the confusion.
> > 
> > If dma->cache_flush_required is set to true by a domain not enforcing cache
> > coherency, we hope it will not be reset to false by a later attaching to domain 
> > enforcing cache coherency due to the lazily flushing design.
> 
> Right, ok, the vfio_dma objects are shared between domains so we never
> want to set 'dma->cache_flush_required = false' due to the addition of a
> 'domain->enforce_cache_coherent == true'.  So this could be:
> 
> 	if (!dma->cache_flush_required)
> 		dma->cache_flush_required = !domain->enforce_cache_coherency;

Though this code is easier for understanding, it leads to unnecessary setting of
dma->cache_flush_required to false, given domain->enforce_cache_coherency is
true at the most time.

> > > It might add more clarity to just name the mapping flag
> > > dma->mapped_noncoherent.  
> > 
> > The dma->cache_flush_required is to mark whether pages in a vfio_dma requires
> > cache flush in the subsequence mapping into the first non-coherent domain
> > and page unpinning.
> 
> How do we arrive at a sequence where we have dma->cache_flush_required
> that isn't the result of being mapped into a domain with
> !domain->enforce_cache_coherency?
Hmm, dma->cache_flush_required IS the result of being mapped into a domain with
!domain->enforce_cache_coherency.
My concern only arrives from the actual code sequence, i.e.
dma->cache_flush_required is set to true before the actual mapping.

If we rename it to dma->mapped_noncoherent and only set it to true after the
actual successful mapping, it would lead to more code to handle flushing for the
unwind case.
Currently, flush for unwind is handled centrally in vfio_unpin_pages_remote()
by checking dma->cache_flush_required, which is true even before a full
successful mapping, so we won't miss flush on any pages that are mapped into a
non-coherent domain in a short window.

> 
> It seems to me that we only get 'dma->cache_flush_required == true' as
> a result of being mapped into a 'domain->enforce_cache_coherency ==
> false' domain.  In that case the flush-on-map is handled at the time
> we're setting dma->cache_flush_required and what we're actually
> tracking with the flag is that the dma object has been mapped into a
> noncoherent domain.
> 
> > So, mapped_noncoherent may not be accurate.
> > Do you think it's better to put a comment for explanation? 
> > 
> > struct vfio_dma {
> >         ...    
> >         bool                    iommu_mapped;
> >         bool                    lock_cap;       /* capable(CAP_IPC_LOCK) */
> >         bool                    vaddr_invalid;
> >         /*
> >          *  Mark whether it is required to flush CPU caches when mapping pages
> >          *  of the vfio_dma to the first non-coherent domain and when unpinning
> >          *  pages of the vfio_dma
> >          */
> >         bool                    cache_flush_required;
> >         ...    
> > };
> > >   
> > > >  
> > > >  		while (iova < dma->iova + dma->size) {
> > > >  			phys_addr_t phys;
> > > > @@ -1737,6 +1774,9 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> > > >  				size = npage << PAGE_SHIFT;
> > > >  			}
> > > >  
> > > > +			if (cache_flush_required)
> > > > +				arch_clean_nonsnoop_dma(phys, size);
> > > > +  
> > > 
> > > I agree with others as well that this arch callback should be named
> > > something relative to the cache-flush/write-back operation that it
> > > actually performs instead of the overall reason for us requiring it.
> > >  
> > Ok. If there are no objections, I'll rename it to arch_flush_cache_phys() as
> > suggested by Kevin.
> 
> Yes, better.
> 
> > > >  			ret = iommu_map(domain->domain, iova, phys, size,
> > > >  					dma->prot | IOMMU_CACHE,
> > > >  					GFP_KERNEL_ACCOUNT);
> > > > @@ -1801,6 +1841,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> > > >  			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
> > > >  						size >> PAGE_SHIFT, true);
> > > >  		}
> > > > +		dma->cache_flush_required = false;
> > > >  	}
> > > >  
> > > >  	vfio_batch_fini(&batch);
> > > > @@ -1828,6 +1869,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *
> > > >  	if (!pages)
> > > >  		return;
> > > >  
> > > > +	if (!domain->enforce_cache_coherency)
> > > > +		arch_clean_nonsnoop_dma(page_to_phys(pages), PAGE_SIZE * 2);
> > > > +
> > > >  	list_for_each_entry(region, regions, list) {
> > > >  		start = ALIGN(region->start, PAGE_SIZE * 2);
> > > >  		if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
> > > > @@ -1847,6 +1891,9 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *
> > > >  		break;
> > > >  	}
> > > >  
> > > > +	if (!domain->enforce_cache_coherency)
> > > > +		arch_clean_nonsnoop_dma(page_to_phys(pages), PAGE_SIZE * 2);
> > > > +  
> > > 
> > > Seems like this use case isn't subject to the unmap aspect since these
> > > are kernel allocated and freed pages rather than userspace pages.
> > > There's not an "ongoing use of the page" concern.
> > > 
> > > The window of opportunity for a device to discover and exploit the
> > > mapping side issue appears almost impossibly small.
> > >  
> > The concern is for a malicious device attempting DMAs automatically.
> > Do you think this concern is valid?
> > As there're only extra flushes for 4 pages, what about keeping it for safety?
> 
> Userspace doesn't know anything about these mappings, so to exploit
> them the device would somehow need to discover and interact with the
> mapping in the split second that the mapping exists, without exposing
> itself with mapping faults at the IOMMU.
> 
> I don't mind keeping the flush before map so that infinitesimal gap
> where previous data in physical memory exposed to the device is closed,
> but I have a much harder time seeing that the flush on unmap to
> synchronize physical memory is required.
> 
> For example, the potential KSM use case doesn't exist since the pages
> are not owned by the user.  Any subsequent use of the pages would be
> subject to the same condition we assumed after allocation, where the
> physical data may be inconsistent with the cached data.  It's easy to
> flush 2 pages, but I think it obscures the function of the flush if we
> can't articulate the value in this case.
>
I agree the second flush is not necessary if we are confident that functions in
between the two flushes do not and will not touch the page in CPU side.
However, can we guarantee this? For instance, is it possible for some IOMMU
driver to read/write the page for some quirks? (Or is it just a totally
paranoid?)
If that's not impossible, then ensuring cache and memory coherency before
page reclaiming is better?

> 
> > > >  	__free_pages(pages, order);
> > > >  }
> > > >  
> > > > @@ -2308,6 +2355,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
> > > >  
> > > >  	list_add(&domain->next, &iommu->domain_list);
> > > >  	vfio_update_pgsize_bitmap(iommu);
> > > > +	if (!domain->enforce_cache_coherency)
> > > > +		vfio_update_noncoherent_domain_state(iommu);  
> > > 
> > > Why isn't this simply:
> > > 
> > > 	if (!domain->enforce_cache_coherency)
> > > 		iommu->has_noncoherent_domain = true;  
> > Yes, it's simpler during attach.
> > 
> > > Or maybe:
> > > 
> > > 	if (!domain->enforce_cache_coherency)
> > > 		iommu->noncoherent_domains++;  
> > Yes, this counter is better.
> > I previously thought a bool can save some space.
> > 
> > > >  done:
> > > >  	/* Delete the old one and insert new iova list */
> > > >  	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
> > > > @@ -2508,6 +2557,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
> > > >  			}
> > > >  			iommu_domain_free(domain->domain);
> > > >  			list_del(&domain->next);
> > > > +			if (!domain->enforce_cache_coherency)
> > > > +				vfio_update_noncoherent_domain_state(iommu);  
> > > 
> > > If we were to just track the number of noncoherent domains, this could
> > > simply be iommu->noncoherent_domains-- and VFIO_DMA_CC_DMA could be:
> > > 
> > > 	return iommu->noncoherent_domains ? 1 : 0;
> > > 
> > > Maybe there should be wrappers for list_add() and list_del() relative
> > > to the iommu domain list to make it just be a counter.  Thanks,  
> > 
> > Do you think we can skip the "iommu->noncoherent_domains--" in
> > vfio_iommu_type1_release() when iommu is about to be freed.
> > 
> > Asking that is also because it's hard for me to find a good name for the wrapper
> > around list_del().  :)
> 
> vfio_iommu_link_domain(), vfio_iommu_unlink_domain()?

Ah, this is a good name!

> > 
> > It follows vfio_release_domain() in vfio_iommu_type1_release(), but not in
> > vfio_iommu_type1_detach_group().
> 
> I'm not sure I understand the concern here, detach_group is performed
> under the iommu->lock where the value of iommu->noncohernet_domains is
> only guaranteed while this lock is held.  In the release callback the
> iommu->lock is not held, but we have no external users at this point.
> It's not strictly required that we decrement each domain, but it's also
> not a bad sanity test that iommu->noncoherent_domains should be zero
> after unlinking the domains.  Thanks,
I previously thought I couldn't find a name for a domain operation that's
called after vfio_release_domain(), and I couldn't merge list_del() into
vfio_release_domain() given it's not in vfio_iommu_type1_detach_group().

But vfio_iommu_unlink_domain() is a good one.
I'll rename list_del() to vfio_iommu_unlink_domain().

Thanks!
Yan


  reply	other threads:[~2024-05-13  7:12 UTC|newest]

Thread overview: 78+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-07  6:18 [PATCH 0/5] Enforce CPU cache flush for non-coherent device assignment Yan Zhao
2024-05-07  6:19 ` [PATCH 1/5] x86/pat: Let pat_pfn_immune_to_uc_mtrr() check MTRR for untracked PAT range Yan Zhao
2024-05-07  8:26   ` Tian, Kevin
2024-05-07  9:12     ` Yan Zhao
2024-05-08 22:14       ` Alex Williamson
2024-05-09  3:36         ` Yan Zhao
2024-05-16  7:42       ` Tian, Kevin
2024-05-16 14:07         ` Sean Christopherson
2024-05-20  2:36           ` Tian, Kevin
2024-05-07  6:20 ` [PATCH 2/5] KVM: x86/mmu: Fine-grained check of whether a invalid & RAM PFN is MMIO Yan Zhao
2024-05-07  8:39   ` Tian, Kevin
2024-05-07  9:19     ` Yan Zhao
2024-05-07  6:20 ` [PATCH 3/5] x86/mm: Introduce and export interface arch_clean_nonsnoop_dma() Yan Zhao
2024-05-07  8:51   ` Tian, Kevin
2024-05-07  9:40     ` Yan Zhao
2024-05-20 14:07   ` Christoph Hellwig
2024-05-21 15:49     ` Jason Gunthorpe
2024-05-21 16:00       ` Jason Gunthorpe
2024-05-22  3:41         ` Yan Zhao
2024-05-28  6:37         ` Christoph Hellwig
2024-06-01 19:46           ` Jason Gunthorpe
2024-06-06  2:48             ` Yan Zhao
2024-06-06 11:55               ` Jason Gunthorpe
2024-06-07  9:39                 ` Yan Zhao
2024-05-07  6:21 ` [PATCH 4/5] vfio/type1: Flush CPU caches on DMA pages in non-coherent domains Yan Zhao
2024-05-09 18:10   ` Alex Williamson
2024-05-10 10:31     ` Yan Zhao
2024-05-10 16:57       ` Alex Williamson
2024-05-13  7:11         ` Yan Zhao [this message]
2024-05-16  7:53           ` Tian, Kevin
2024-05-16  8:34           ` Tian, Kevin
2024-05-16 20:31             ` Alex Williamson
2024-05-17 17:11               ` Jason Gunthorpe
2024-05-20  2:52                 ` Tian, Kevin
2024-05-21 16:07                   ` Jason Gunthorpe
2024-05-21 16:21                     ` Alex Williamson
2024-05-21 16:34                       ` Jason Gunthorpe
2024-05-21 18:19                         ` Alex Williamson
2024-05-21 18:37                           ` Jason Gunthorpe
2024-05-22  6:24                             ` Tian, Kevin
2024-05-22 12:29                               ` Jason Gunthorpe
2024-05-22 14:43                                 ` Alex Williamson
2024-05-22 16:52                                   ` Jason Gunthorpe
2024-05-22 18:22                                     ` Alex Williamson
2024-05-22 23:26                                 ` Tian, Kevin
2024-05-22 23:32                                   ` Jason Gunthorpe
2024-05-22 23:40                                     ` Tian, Kevin
2024-05-23 14:58                                       ` Jason Gunthorpe
2024-05-23 22:47                                         ` Alex Williamson
2024-05-24  0:30                                           ` Tian, Kevin
2024-05-24 13:50                                           ` Jason Gunthorpe
2024-05-22  3:33                           ` Yan Zhao
2024-05-22  3:24                         ` Yan Zhao
2024-05-22 12:26                           ` Jason Gunthorpe
2024-05-24  3:07                             ` Yan Zhao
2024-05-16 20:50           ` Alex Williamson
2024-05-17  3:11             ` Yan Zhao
2024-05-17  4:44               ` Alex Williamson
2024-05-17  5:00                 ` Yan Zhao
2024-05-07  6:22 ` [PATCH 5/5] iommufd: " Yan Zhao
2024-05-09 14:13   ` Jason Gunthorpe
2024-05-10  8:03     ` Yan Zhao
2024-05-10 13:29       ` Jason Gunthorpe
2024-05-13  7:43         ` Yan Zhao
2024-05-14 15:11           ` Jason Gunthorpe
2024-05-15  7:06             ` Yan Zhao
2024-05-15 20:43               ` Jason Gunthorpe
2024-05-16  2:32                 ` Yan Zhao
2024-05-16  8:38                   ` Tian, Kevin
2024-05-16  9:48                     ` Yan Zhao
2024-05-17 17:04                   ` Jason Gunthorpe
2024-05-20  2:45                     ` Yan Zhao
2024-05-21 16:04                       ` Jason Gunthorpe
2024-05-22  3:17                         ` Yan Zhao
2024-05-22  6:29                           ` Yan Zhao
2024-05-22 17:01                             ` Jason Gunthorpe
2024-05-27  7:15                               ` Yan Zhao
2024-06-01 16:48                                 ` Jason Gunthorpe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZkG9IEQwi7HG3YBk@yzhao56-desk.sh.intel.com \
    --to=yan.y.zhao@intel.com \
    --cc=alex.williamson@redhat.com \
    --cc=baolu.lu@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=hpa@zytor.com \
    --cc=iommu@lists.linux.dev \
    --cc=jgg@nvidia.com \
    --cc=joro@8bytes.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mingo@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=robin.murphy@arm.com \
    --cc=seanjc@google.com \
    --cc=tglx@linutronix.de \
    --cc=will@kernel.org \
    --cc=x86@kernel.org \
    --cc=yi.l.liu@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).