mm, dax: fix DAX deadlocks

The following two locking commits in the DAX code:

commit 843172978b ("dax: fix race between simultaneous faults")
commit 46c043ede4 ("mm: take i_mmap_lock in unmap_mapping_range() for DAX")

introduced a number of deadlocks and other issues which need to be fixed
for the v4.3 kernel.  The list of issues in DAX after these commits
(some newly introduced by the commits, some preexisting) can be found
here:

  https://lkml.org/lkml/2015/9/25/602 (Subject: "Re: [PATCH] dax: fix deadlock in __dax_fault").

This undoes most of the changes introduced by those two commits,
essentially returning us to the DAX locking scheme that was used in
v4.2.
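
To illustrate the scheme this returns to: the fault paths take
i_mmap_lock shared around the truncate re-check and the mapping
insert, while unmap_mapping_range() (the truncate path) holds it
exclusive.  The userspace sketch below models only that serialization;
it is illustrative, with a pthread rwlock standing in for i_mmap_lock
and every name in it invented:

  #include <pthread.h>
  #include <stdio.h>

  /* Stand-ins for the inode's i_mmap rwsem and its size. */
  static pthread_rwlock_t i_mmap_lock = PTHREAD_RWLOCK_INITIALIZER;
  static unsigned long i_size = 4096;

  /* Fault side (cf. dax_insert_mapping): take the lock shared, then
   * re-check the size so a truncate that won the race is noticed. */
  static int fault_sketch(unsigned long offset)
  {
          int error = 0;

          pthread_rwlock_rdlock(&i_mmap_lock);
          if (offset >= i_size)
                  error = -1;     /* raced with truncate, back out */
          else
                  printf("mapped offset %lu\n", offset);
          pthread_rwlock_unlock(&i_mmap_lock);
          return error;
  }

  /* Truncate side (cf. unmap_mapping_range): shrink and unmap with
   * the lock held exclusive, so no fault can run concurrently. */
  static void truncate_sketch(unsigned long new_size)
  {
          pthread_rwlock_wrlock(&i_mmap_lock);
          i_size = new_size;
          printf("unmapped past %lu\n", new_size);
          pthread_rwlock_unlock(&i_mmap_lock);
  }

  int main(void)
  {
          fault_sketch(0);                /* succeeds */
          truncate_sketch(0);
          return fault_sketch(0) ? 0 : 1; /* must now fail */
  }

Build with "cc sketch.c -pthread"; the second fault observes the
truncate and backs out rather than mapping a stale page.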

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dan Williams <dan.j.williams@intel.com>
Tested-by: Dave Chinner <dchinner@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
commit 0f90cc6609 (parent 424cdc1413)
Author:    Ross Zwisler, 2015-10-15 15:28:32 -07:00
Committer: Linus Torvalds

 2 changed files with 31 additions and 41 deletions

diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c

@@ -285,6 +285,7 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct address_space *mapping = inode->i_mapping;
 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	void __pmem *addr;
@@ -292,6 +293,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	pgoff_t size;
 	int error;
 
+	i_mmap_lock_read(mapping);
+
 	/*
 	 * Check truncate didn't happen while we were allocating a block.
 	 * If it did, this block may or may not be still allocated to the
@@ -321,6 +324,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 		error = vm_insert_mixed(vma, vaddr, pfn);
 
  out:
+	i_mmap_unlock_read(mapping);
+
 	return error;
 }
 
@@ -382,17 +387,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			 * from a read fault and we've raced with a truncate
 			 */
 			error = -EIO;
-			goto unlock;
+			goto unlock_page;
 		}
-	} else {
-		i_mmap_lock_write(mapping);
 	}
 
 	error = get_block(inode, block, &bh, 0);
 	if (!error && (bh.b_size < PAGE_SIZE))
 		error = -EIO;		/* fs corruption? */
 	if (error)
-		goto unlock;
+		goto unlock_page;
 
 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -403,9 +406,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			if (!error && (bh.b_size < PAGE_SIZE))
 				error = -EIO;
 			if (error)
-				goto unlock;
+				goto unlock_page;
 		} else {
-			i_mmap_unlock_write(mapping);
 			return dax_load_hole(mapping, page, vmf);
 		}
 	}
@@ -417,15 +419,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
-			goto unlock;
+			goto unlock_page;
 		vmf->page = page;
 		if (!page) {
+			i_mmap_lock_read(mapping);
 			/* Check we didn't race with truncate */
 			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
 								PAGE_SHIFT;
 			if (vmf->pgoff >= size) {
+				i_mmap_unlock_read(mapping);
 				error = -EIO;
-				goto unlock;
+				goto out;
 			}
 		}
 		return VM_FAULT_LOCKED;
@@ -461,8 +465,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
 
-	if (!page)
-		i_mmap_unlock_write(mapping);
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
@@ -471,14 +473,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
 
- unlock:
+ unlock_page:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
-	} else {
-		i_mmap_unlock_write(mapping);
 	}
-
 	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -556,10 +555,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	i_mmap_lock_write(mapping);
 	length = get_block(inode, block, &bh, write);
 	if (length)
 		return VM_FAULT_SIGBUS;
+	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -569,36 +568,14 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
 		goto fallback;
 
-	sector = bh.b_blocknr << (blkbits - 9);
-
-	if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-		int i;
-
-		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
-						bh.b_size);
-		if (length < 0) {
-			result = VM_FAULT_SIGBUS;
-			goto out;
-		}
-		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
-			goto fallback;
-
-		for (i = 0; i < PTRS_PER_PMD; i++)
-			clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
-		wmb_pmem();
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-		result |= VM_FAULT_MAJOR;
-	}
-
 	/*
 	 * If we allocated new storage, make sure no process has any
 	 * zero pages covering this hole
 	 */
 	if (buffer_new(&bh)) {
-		i_mmap_unlock_write(mapping);
+		i_mmap_unlock_read(mapping);
 		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_write(mapping);
+		i_mmap_lock_read(mapping);
 	}
 
 	/*
@@ -635,6 +612,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		result = VM_FAULT_NOPAGE;
 		spin_unlock(ptl);
 	} else {
+		sector = bh.b_blocknr << (blkbits - 9);
 		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
 						bh.b_size);
 		if (length < 0) {
@@ -644,15 +622,25 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
 			goto fallback;
 
+		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+			int i;
+			for (i = 0; i < PTRS_PER_PMD; i++)
+				clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+			wmb_pmem();
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			result |= VM_FAULT_MAJOR;
+		}
+
 		result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
 	}
 
  out:
+	i_mmap_unlock_read(mapping);
+
 	if (buffer_unwritten(&bh))
 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
-	i_mmap_unlock_write(mapping);
-
 	return result;
 
 fallback:

diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c

@@ -2426,6 +2426,8 @@ void unmap_mapping_range(struct address_space *mapping,
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
+
+	/* DAX uses i_mmap_lock to serialise file truncate vs page fault */
 	i_mmap_lock_write(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);