[PATCH] mbind: check_range use standard ptwalk

Strict mbind's check for currently mapped pages being on node has been
using a slow loop which re-evaluates pgd, pud, pmd, pte for each entry:
replace that by a standard four-level page table walk like others in mm.
Since mmap_sem is held for writing, page_table_lock can be taken at the
inner level to limit latency.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Hugh Dickins 2005-06-21 17:15:07 -07:00 committed by Linus Torvalds
parent 941150a326
commit 91612e0df2

View File

@ -238,56 +238,81 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
} }
/* Ensure all existing pages follow the policy. */ /* Ensure all existing pages follow the policy. */
static int static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
verify_pages(struct mm_struct *mm,
unsigned long addr, unsigned long end, unsigned long *nodes) unsigned long addr, unsigned long end, unsigned long *nodes)
{ {
int err = 0; pte_t *orig_pte;
pte_t *pte;
spin_lock(&mm->page_table_lock); spin_lock(&mm->page_table_lock);
while (addr < end) { orig_pte = pte = pte_offset_map(pmd, addr);
struct page *p; do {
pte_t *pte; unsigned long pfn;
pmd_t *pmd; unsigned int nid;
pud_t *pud;
pgd_t *pgd; if (!pte_present(*pte))
pgd = pgd_offset(mm, addr); continue;
if (pgd_none(*pgd)) { pfn = pte_pfn(*pte);
unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; if (!pfn_valid(pfn))
if (next > addr) continue;
nid = pfn_to_nid(pfn);
if (!test_bit(nid, nodes))
break; break;
addr = next; } while (pte++, addr += PAGE_SIZE, addr != end);
continue; pte_unmap(orig_pte);
}
pud = pud_offset(pgd, addr);
if (pud_none(*pud)) {
addr = (addr + PUD_SIZE) & PUD_MASK;
continue;
}
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
addr = (addr + PMD_SIZE) & PMD_MASK;
continue;
}
p = NULL;
pte = pte_offset_map(pmd, addr);
if (pte_present(*pte)) {
unsigned long pfn = pte_pfn(*pte);
if (pfn_valid(pfn))
p = pfn_to_page(pfn);
}
pte_unmap(pte);
if (p) {
unsigned nid = page_to_nid(p);
if (!test_bit(nid, nodes)) {
err = -EIO;
break;
}
}
addr += PAGE_SIZE;
}
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
return err; return addr != end;
}
/*
 * Walk the pmd entries under @pud that cover [addr, end) and run
 * check_pte_range() on each entry that pmd_none_or_clear_bad() does not
 * reject.
 *
 * Returns -EIO as soon as check_pte_range() reports a nonzero result for
 * one of the sub-ranges, 0 if the whole range was walked without one.
 */
static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
		unsigned long addr, unsigned long end, unsigned long *nodes)
{
	pmd_t *entry = pmd_offset(pud, addr);
	unsigned long stop;

	do {
		/* End of the piece of [addr, end) handled by this entry. */
		stop = pmd_addr_end(addr, end);

		/* Only descend into entries the helper does not reject. */
		if (!pmd_none_or_clear_bad(entry) &&
		    check_pte_range(mm, entry, addr, stop, nodes))
			return -EIO;
	} while (entry++, addr = stop, addr != end);

	return 0;
}
/*
 * Walk the pud entries under @pgd that cover [addr, end) and run
 * check_pmd_range() on each entry that pud_none_or_clear_bad() does not
 * reject.
 *
 * Returns -EIO as soon as check_pmd_range() reports a nonzero result for
 * one of the sub-ranges, 0 if the whole range was walked without one.
 */
static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
		unsigned long addr, unsigned long end, unsigned long *nodes)
{
	pud_t *entry = pud_offset(pgd, addr);
	unsigned long stop;

	do {
		/* End of the piece of [addr, end) handled by this entry. */
		stop = pud_addr_end(addr, end);

		/* Only descend into entries the helper does not reject. */
		if (!pud_none_or_clear_bad(entry) &&
		    check_pmd_range(mm, entry, addr, stop, nodes))
			return -EIO;
	} while (entry++, addr = stop, addr != end);

	return 0;
}
/*
 * Top level of the page-table walk used by check_range(): iterate over the
 * pgd entries of @mm covering [addr, end) and hand each entry that
 * pgd_none_or_clear_bad() does not reject to check_pud_range().
 *
 * @mm:    address space whose page tables are walked
 * @addr:  start of the range (check_range() passes vma->vm_start)
 * @end:   end of the range (check_range() passes vma->vm_end)
 * @nodes: node bitmap, passed through unchanged to the lower levels
 *
 * Returns -EIO as soon as check_pud_range() reports a nonzero result,
 * otherwise 0 once addr reaches end.
 */
static inline int check_pgd_range(struct mm_struct *mm,
unsigned long addr, unsigned long end, unsigned long *nodes)
{
pgd_t *pgd;
unsigned long next;
pgd = pgd_offset(mm, addr);
do {
/* next = end of the part of [addr, end) covered by this pgd entry */
next = pgd_addr_end(addr, end);
/* "continue" in a do/while still evaluates the condition, so pgd and addr advance */
if (pgd_none_or_clear_bad(pgd))
continue;
if (check_pud_range(mm, pgd, addr, next, nodes))
return -EIO;
} while (pgd++, addr = next, addr != end);
return 0;
} }
/* Step 1: check the range */ /* Step 1: check the range */
@ -308,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (prev && prev->vm_end < vma->vm_start) if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT); return ERR_PTR(-EFAULT);
if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
err = verify_pages(vma->vm_mm, err = check_pgd_range(vma->vm_mm,
vma->vm_start, vma->vm_end, nodes); vma->vm_start, vma->vm_end, nodes);
if (err) { if (err) {
first = ERR_PTR(err); first = ERR_PTR(err);