From 29f3ad7d8380364c86556eedf4eedd3b3d4921dc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Nov 2016 18:08:11 +0100 Subject: [PATCH 1/6] fs: Provide function to unmap metadata for a range of blocks Provide function equivalent to unmap_underlying_metadata() for a range of blocks. We somewhat optimize the function to use pagevec lookups instead of looking up buffer heads one by one and use page lock to pin buffer heads instead of mapping's private_lock to improve scalability. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/buffer.c | 76 +++++++++++++++++++++++++++++++++++++ include/linux/buffer_head.h | 2 + 2 files changed, 78 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index af5776da814a..f8beca55240a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -43,6 +43,7 @@ #include #include #include +#include #include static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -1636,6 +1637,81 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block) } EXPORT_SYMBOL(unmap_underlying_metadata); +/** + * clean_bdev_aliases: clean a range of buffers in block device + * @bdev: Block device to clean buffers in + * @block: Start of a range of blocks to clean + * @len: Number of blocks to clean + * + * We are taking a range of blocks for data and we don't want writeback of any + * buffer-cache aliases starting from return from this function and until the + * moment when something will explicitly mark the buffer dirty (hopefully that + * will not happen until we will free that block ;-) We don't even need to mark + * it not-uptodate - nobody can expect anything from a newly allocated buffer + * anyway. We used to use unmap_buffer() for such invalidation, but that was + * wrong. We definitely don't want to mark the alias unmapped, for example - it + * would confuse anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can be + * writeout I/O going on against recently-freed buffers. We don't wait on that + * I/O in bforget() - it's more efficient to wait on the I/O only if we really + * need to. That happens here. + */ +void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) +{ + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + pgoff_t end; + int i; + struct buffer_head *bh; + struct buffer_head *head; + + end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); + pagevec_init(&pvec, 0); + while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + if (!page_has_buffers(page)) + continue; + /* + * We use page lock instead of bd_mapping->private_lock + * to pin buffers here since we can afford to sleep and + * it scales better than a global spinlock lock. + */ + lock_page(page); + /* Recheck when the page is locked which pins bhs */ + if (!page_has_buffers(page)) + goto unlock_page; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh)) + goto next; + if (bh->b_blocknr >= block + len) + break; + clear_buffer_dirty(bh); + wait_on_buffer(bh); + clear_buffer_req(bh); +next: + bh = bh->b_this_page; + } while (bh != head); +unlock_page: + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + index++; + } +} +EXPORT_SYMBOL(clean_bdev_aliases); + /* * Size is a power-of-two in the range 512..PAGE_SIZE, * and the case we care about most is PAGE_SIZE. diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index ebbacd14d450..9c9c73ce7d4f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -169,6 +169,8 @@ void invalidate_inode_buffers(struct inode *); int remove_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); void unmap_underlying_metadata(struct block_device *bdev, sector_t block); +void clean_bdev_aliases(struct block_device *bdev, sector_t block, + sector_t len); void mark_buffer_async_write(struct buffer_head *bh); void __wait_on_buffer(struct buffer_head *); From f734c89cc96e9b6f903865cd2656d9d8a7e160e7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Nov 2016 18:08:12 +0100 Subject: [PATCH 2/6] direct-io: Use clean_bdev_aliases() instead of handmade iteration Use new provided function instead of an iteration through all allocated blocks. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/direct-io.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index a5138c564019..4ea57edf3e54 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -842,24 +842,6 @@ out: return ret; } -/* - * Clean any dirty buffers in the blockdev mapping which alias newly-created - * file blocks. Only called for S_ISREG files - blockdevs do not set - * buffer_new - */ -static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) -{ - unsigned i; - unsigned nblocks; - - nblocks = map_bh->b_size >> dio->inode->i_blkbits; - - for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(map_bh->b_bdev, - map_bh->b_blocknr + i); - } -} - /* * If we are not writing the entire block and get_block() allocated * the block for us, we need to fill-in the unused portion of the @@ -960,11 +942,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, goto do_holes; sdio->blocks_available = - map_bh->b_size >> sdio->blkbits; + map_bh->b_size >> blkbits; sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; - if (buffer_new(map_bh)) - clean_blockdev_aliases(dio, map_bh); + if (buffer_new(map_bh)) { + clean_bdev_aliases( + map_bh->b_bdev, + map_bh->b_blocknr, + map_bh->b_size >> blkbits); + } if (!sdio->blkfactor) goto do_holes; From 64e1c57fa4740ac0728afe173e5a025b0e94cd55 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Nov 2016 18:08:13 +0100 Subject: [PATCH 3/6] ext4: Use clean_bdev_aliases() instead of iteration Use clean_bdev_aliases() instead of iterating through blocks one by one. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/ext4/extents.c | 13 ++----------- fs/ext4/inode.c | 15 ++++----------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..dd5b74dfa018 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3777,14 +3777,6 @@ out: return err; } -static void unmap_underlying_metadata_blocks(struct block_device *bdev, - sector_t block, int count) -{ - int i; - for (i = 0; i < count; i++) - unmap_underlying_metadata(bdev, block + i); -} - /* * Handle EOFBLOCKS_FL flag, clearing it if necessary */ @@ -4121,9 +4113,8 @@ out: * new. */ if (allocated > map->m_len) { - unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + map->m_len, - allocated - map->m_len); + clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, + allocated - map->m_len); allocated = map->m_len; } map->m_len = allocated; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c064727ed62..7c7cc4ae4b8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -654,12 +654,8 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - ext4_lblk_t i; - - for (i = 0; i < map->m_len; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - map->m_pblk + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -2360,11 +2356,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) BUG_ON(map->m_len == 0); if (map->m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map->m_len; i++) - unmap_underlying_metadata(bdev, map->m_pblk + i); + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); } return 0; } From 69a9bea146b185be8ec50e80eaecd8e487e689f8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Nov 2016 18:08:14 +0100 Subject: [PATCH 4/6] ext2: Use clean_bdev_aliases() instead of iteration Use clean_bdev_aliases() instead of iterating through blocks one by one. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/ext2/inode.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index d831e24dc885..eb11f7e2b8aa 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { - int i; - /* * We must unmap blocks before zeroing so that writeback cannot * overwrite zeros with stale data from block device page cache. */ - for (i = 0; i < count; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key), + count); /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's From e64855c6cfaa0a80c1b71c5f647cb792dc436668 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Nov 2016 18:08:15 +0100 Subject: [PATCH 5/6] fs: Add helper to clean bdev aliases under a bh and use it Add a helper function that clears buffer heads from a block device aliasing passed bh. Use this helper function from filesystems instead of the original unmap_underlying_metadata() to save some boiler plate code and also have a better name for the functionalily since it is not unmapping anything for a *long* time. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/buffer.c | 8 +++----- fs/ext4/inode.c | 3 +-- fs/ext4/page-io.c | 2 +- fs/mpage.c | 3 +-- fs/ntfs/aops.c | 2 +- fs/ntfs/file.c | 5 ++--- fs/ocfs2/aops.c | 2 +- fs/ufs/balloc.c | 3 +-- fs/ufs/inode.c | 3 +-- include/linux/buffer_head.h | 4 ++++ 10 files changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index f8beca55240a..912d70169fca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1821,8 +1821,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); } } bh = bh->b_this_page; @@ -2068,8 +2067,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, } if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2709,7 +2707,7 @@ int nobh_write_begin(struct address_space *mapping, if (!buffer_mapped(bh)) is_mapped_to_disk = 0; if (buffer_new(bh)) - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c7cc4ae4b8e..2f8127601bef 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1123,8 +1123,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index e0b3b54cdef3..f28fd6483e04 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } set_buffer_async_write(bh); nr_to_submit++; diff --git a/fs/mpage.c b/fs/mpage.c index 98fc11aa7e0b..28af984a3d96 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (buffer_new(&map_bh)) - unmap_underlying_metadata(map_bh.b_bdev, - map_bh.b_blocknr); + clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index d0cf6fee5c77..cc91856b5e2d 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -765,7 +765,7 @@ lock_retry_remap: } // TODO: Instantiate the hole. // clear_buffer_new(bh); - // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + // clean_bdev_bh_alias(bh); ntfs_error(vol->sb, "Writing into sparse regions is " "not supported yet. Sorry."); err = -EOPNOTSUPP; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index bf72a2c58b75..99510d811a8c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -740,8 +740,7 @@ map_buffer_cached: set_buffer_uptodate(bh); if (unlikely(was_hole)) { /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (bh_end <= pos || bh_pos >= end) mark_buffer_dirty(bh); else @@ -784,7 +783,7 @@ map_buffer_cached: continue; } /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * If the buffer is fully outside the write, zero it, * set it uptodate, and mark it dirty so it gets diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..e8f65eefffca 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -630,7 +630,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } if (PageUptodate(page)) { diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b035af54f538..a0376a2c1c29 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -307,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, (unsigned long long)(pos + newb), pos); bh->b_blocknr = newb + pos; - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); mark_buffer_dirty(bh); ++j; bh = bh->b_this_page; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 190d64be22ed..45ceb94e89e4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * we do not zeroize fragment, because of * if it maped to hole, it already contains zeroes diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 9c9c73ce7d4f..d1ab91fc6d43 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -171,6 +171,10 @@ int sync_mapping_buffers(struct address_space *mapping); void unmap_underlying_metadata(struct block_device *bdev, sector_t block); void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len); +static inline void clean_bdev_bh_alias(struct buffer_head *bh) +{ + clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1); +} void mark_buffer_async_write(struct buffer_head *bh); void __wait_on_buffer(struct buffer_head *); From ce98321bf7d274a470642ef99e1d82512673ce7c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Nov 2016 18:08:16 +0100 Subject: [PATCH 6/6] fs: Remove unmap_underlying_metadata Nobody is using this function anymore. Remove it. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/buffer.c | 32 -------------------------------- include/linux/buffer_head.h | 1 - 2 files changed, 33 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 912d70169fca..1104ce8b4536 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1605,38 +1605,6 @@ void create_empty_buffers(struct page *page, } EXPORT_SYMBOL(create_empty_buffers); -/* - * We are taking a block for data and we don't want any output from any - * buffer-cache aliases starting from return from that function and - * until the moment when something will explicitly mark the buffer - * dirty (hopefully that will not happen until we will free that block ;-) - * We don't even need to mark it not-uptodate - nobody can expect - * anything from a newly allocated buffer anyway. We used to used - * unmap_buffer() for such invalidation, but that was wrong. We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - * - * Also.. Note that bforget() doesn't lock the buffer. So there can - * be writeout I/O going on against recently-freed buffers. We don't - * wait on that I/O in bforget() - it's more efficient to wait on the I/O - * only if we really need to. That happens here. - */ -void unmap_underlying_metadata(struct block_device *bdev, sector_t block) -{ - struct buffer_head *old_bh; - - might_sleep(); - - old_bh = __find_get_block_slow(bdev, block); - if (old_bh) { - clear_buffer_dirty(old_bh); - wait_on_buffer(old_bh); - clear_buffer_req(old_bh); - __brelse(old_bh); - } -} -EXPORT_SYMBOL(unmap_underlying_metadata); - /** * clean_bdev_aliases: clean a range of buffers in block device * @bdev: Block device to clean buffers in diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index d1ab91fc6d43..d67ab83823ad 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -168,7 +168,6 @@ int inode_has_buffers(struct inode *); void invalidate_inode_buffers(struct inode *); int remove_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); -void unmap_underlying_metadata(struct block_device *bdev, sector_t block); void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len); static inline void clean_bdev_bh_alias(struct buffer_head *bh)