[Intel-gfx] [PATCH] mm: Report attempts to overwrite PTE from remap_pfn_range()

Kirill A. Shutemov kirill.shutemov at linux.intel.com
Thu Jun 19 13:50:18 CEST 2014


Chris Wilson wrote:
> When using remap_pfn_range() from a fault handler, we are exposed to
> races between concurrent faults. Rather than hitting a BUG, report the
> error back to the caller, like vm_insert_pfn().
> 
> v2: Fix the pte address for unmapping along the error path.
> v3: Report the error back and cleanup partial remaps.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Andrew Morton <akpm at linux-foundation.org>
> Cc: "Kirill A. Shutemov" <kirill.shutemov at linux.intel.com>
> Cc: Peter Zijlstra <peterz at infradead.org>
> Cc: Rik van Riel <riel at redhat.com>
> Cc: Mel Gorman <mgorman at suse.de>
> Cc: Cyrill Gorcunov <gorcunov at gmail.com>
> Cc: Johannes Weiner <hannes at cmpxchg.org>
> Cc: linux-mm at kvack.org
> ---
> 
> Whilst this has the semantics I want to allow two concurrent, but
> serialised, pagefaults that try to prefault the same object to succeed,
> it looks fragile and fraught with subtlety.
> -Chris
> 
> ---
>  mm/memory.c | 54 ++++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 38 insertions(+), 16 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index d67fd9f..be51fcc 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1657,32 +1657,41 @@ EXPORT_SYMBOL(vm_insert_mixed);
>   * in null mappings (currently treated as "copy-on-access")
>   */
>  static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
> -			unsigned long addr, unsigned long end,
> -			unsigned long pfn, pgprot_t prot)
> +			   unsigned long addr, unsigned long end,
> +			   unsigned long pfn, pgprot_t prot,
> +			   bool first)
>  {

With this long parameter list, wouldn't it be cleaner to pass down a pointer
to a structure instead? This could simplify the code, I believe.

>  	pte_t *pte;
>  	spinlock_t *ptl;
> +	int err = 0;
>  
>  	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
>  	if (!pte)
>  		return -ENOMEM;
>  	arch_enter_lazy_mmu_mode();
>  	do {
> -		BUG_ON(!pte_none(*pte));
> +		if (!pte_none(*pte)) {
> +			err = first ? -EBUSY : -EINVAL;
> +			pte++;
> +			break;
> +		}
> +		first = false;
>  		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
>  		pfn++;
>  	} while (pte++, addr += PAGE_SIZE, addr != end);
>  	arch_leave_lazy_mmu_mode();
>  	pte_unmap_unlock(pte - 1, ptl);
> -	return 0;
> +	return err;
>  }
>  
>  static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
> -			unsigned long addr, unsigned long end,
> -			unsigned long pfn, pgprot_t prot)
> +				  unsigned long addr, unsigned long end,
> +				  unsigned long pfn, pgprot_t prot,
> +				  bool first)
>  {
>  	pmd_t *pmd;
>  	unsigned long next;
> +	int err;
>  
>  	pfn -= addr >> PAGE_SHIFT;
>  	pmd = pmd_alloc(mm, pud, addr);
> @@ -1691,19 +1700,23 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
>  	VM_BUG_ON(pmd_trans_huge(*pmd));
>  	do {
>  		next = pmd_addr_end(addr, end);
> -		if (remap_pte_range(mm, pmd, addr, next,
> -				pfn + (addr >> PAGE_SHIFT), prot))
> -			return -ENOMEM;
> +		err = remap_pte_range(mm, pmd, addr, next,
> +				      pfn + (addr >> PAGE_SHIFT), prot, first);
> +		if (err)
> +			return err;
> +
> +		first = false;
>  	} while (pmd++, addr = next, addr != end);
>  	return 0;
>  }
>  
>  static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
> -			unsigned long addr, unsigned long end,
> -			unsigned long pfn, pgprot_t prot)
> +				  unsigned long addr, unsigned long end,
> +				  unsigned long pfn, pgprot_t prot, bool first)
>  {
>  	pud_t *pud;
>  	unsigned long next;
> +	int err;
>  
>  	pfn -= addr >> PAGE_SHIFT;
>  	pud = pud_alloc(mm, pgd, addr);
> @@ -1711,9 +1724,12 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
>  		return -ENOMEM;
>  	do {
>  		next = pud_addr_end(addr, end);
> -		if (remap_pmd_range(mm, pud, addr, next,
> -				pfn + (addr >> PAGE_SHIFT), prot))
> -			return -ENOMEM;
> +		err = remap_pmd_range(mm, pud, addr, next,
> +				      pfn + (addr >> PAGE_SHIFT), prot, first);
> +		if (err)
> +			return err;
> +
> +		first = false;
>  	} while (pud++, addr = next, addr != end);
>  	return 0;
>  }
> @@ -1735,6 +1751,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
>  	unsigned long next;
>  	unsigned long end = addr + PAGE_ALIGN(size);
>  	struct mm_struct *mm = vma->vm_mm;
> +	bool first = true;
>  	int err;
>  
>  	/*
> @@ -1774,13 +1791,18 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
>  	do {
>  		next = pgd_addr_end(addr, end);
>  		err = remap_pud_range(mm, pgd, addr, next,
> -				pfn + (addr >> PAGE_SHIFT), prot);
> +				      pfn + (addr >> PAGE_SHIFT), prot, first);
>  		if (err)
>  			break;
> +
> +		first = false;
>  	} while (pgd++, addr = next, addr != end);
>  
> -	if (err)
> +	if (err) {
>  		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
> +		if (err != -EBUSY)
> +			zap_page_range_single(vma, addr, size, NULL);

Hm. If I read it correctly, you zap the whole range, not only what you've
set up. Looks wrong.

And after the zap, you probably want to return -EBUSY to the caller of
remap_pfn_range(), not -EINVAL.

> +	}
>  
>  	return err;
>  }
> -- 
> 1.9.1
> 

-- 
 Kirill A. Shutemov



More information about the Intel-gfx mailing list