diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f6a92958157d..1b5c2e942df9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1717,6 +1717,43 @@ drop_folio: return result; } +/* Can we retract page tables for this file-backed VMA? */ +static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) +{ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth removing + * page tables from, as PMD-mapping is likely to be split later. + */ + if (READ_ONCE(vma->anon_vma)) + return false; + + /* + * When a vma is registered with uffd-wp, we cannot recycle + * the page table because there may be pte markers installed. + * Other vmas can still have the same file mapped hugely, but + * skip this one: it will always be mapped in small page size + * for uffd-wp registered ranges. + */ + if (userfaultfd_wp(vma)) + return false; + + /* + * If the VMA contains guard regions then we can't collapse it. + * + * This is set atomically on guard marker installation under mmap/VMA + * read lock, and here we may not hold any VMA or mmap lock at all. + * + * This is therefore serialised on the PTE page table lock, which is + * obtained on guard region installation after the flag is set, so this + * check being performed under this lock excludes races. + */ + if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD_BIT)) + return false; + + return true; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1731,14 +1768,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl; bool success = false; - /* - * Check vma->anon_vma to exclude MAP_PRIVATE mappings that - * got written to. These VMAs are likely not worth removing - * page tables from, as PMD-mapping is likely to be split later. - */ - if (READ_ONCE(vma->anon_vma)) - continue; - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || vma->vm_end < addr + HPAGE_PMD_SIZE) @@ -1750,14 +1779,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (hpage_collapse_test_exit(mm)) continue; - /* - * When a vma is registered with uffd-wp, we cannot recycle - * the page table because there may be pte markers installed. - * Other vmas can still have the same file mapped hugely, but - * skip this one: it will always be mapped in small page size - * for uffd-wp registered ranges. - */ - if (userfaultfd_wp(vma)) + + if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ @@ -1784,15 +1807,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* - * Huge page lock is still held, so normally the page table - * must remain empty; and we have already skipped anon_vma - * and userfaultfd_wp() vmas. But since the mmap_lock is not - * held, it is still possible for a racing userfaultfd_ioctl() - * to have inserted ptes or markers. Now that we hold ptlock, - * repeating the anon_vma check protects from one category, - * and repeating the userfaultfd_wp() check from another. + * Huge page lock is still held, so normally the page table must + * remain empty; and we have already skipped anon_vma and + * userfaultfd_wp() vmas. But since the mmap_lock is not held, + * it is still possible for a racing userfaultfd_ioctl() or + * madvise() to have inserted ptes or markers. Now that we hold + * ptlock, repeating the retractable checks protects us from + * races against the prior checks. */ - if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) { + if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); success = true; diff --git a/mm/madvise.c b/mm/madvise.c index 0b3280752bfb..5dbe40be7c65 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1141,15 +1141,21 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior) return -EINVAL; /* - * If we install guard markers, then the range is no longer - * empty from a page table perspective and therefore it's - * appropriate to have an anon_vma. - * - * This ensures that on fork, we copy page tables correctly. + * Set atomically under read lock. All pertinent readers will need to + * acquire an mmap/VMA write lock to read it. All remaining readers may + * or may not see the flag set, but we don't care. */ - err = anon_vma_prepare(vma); - if (err) - return err; + vma_flag_set_atomic(vma, VM_MAYBE_GUARD_BIT); + + /* + * If anonymous and we are establishing page tables the VMA ought to + * have an anon_vma associated with it. + */ + if (vma_is_anonymous(vma)) { + err = anon_vma_prepare(vma); + if (err) + return err; + } /* * Optimistically try to install the guard marker pages first. If any