diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0639a555e16e..74c03fb0ca1d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - rcu_read_lock(); - eb = rcu_dereference(root->node); - extent_buffer_get(eb); - rcu_read_unlock(); + while (1) { + rcu_read_lock(); + eb = rcu_dereference(root->node); + + /* + * RCU really hurts here, we could free up the root node because + * it was cow'ed but we may not get the new root node yet so do + * the inc_not_zero dance and if it doesn't work then + * synchronize_rcu and try again. + */ + if (atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + break; + } + rcu_read_unlock(); + synchronize_rcu(); + } return eb; } @@ -504,7 +517,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } if (unlock_orig) btrfs_tree_unlock(buf); - free_extent_buffer(buf); + free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(cow); *cow_ret = cow; return 0; @@ -959,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); /* once for the root ptr */ - free_extent_buffer(mid); + free_extent_buffer_stale(mid); return 0; } if (btrfs_header_nritems(mid) > @@ -1016,7 +1029,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1, 0); - free_extent_buffer(right); + free_extent_buffer_stale(right); right = NULL; } else { struct btrfs_disk_key right_key; @@ -1056,7 +1069,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - free_extent_buffer(mid); + free_extent_buffer_stale(mid); mid = NULL; } else { /* update the parent key to reflect our changes */ @@ -3781,7 +3794,9 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); + extent_buffer_get(leaf); btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); + free_extent_buffer_stale(leaf); return 0; } /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bc88649cffb7..0ba055e03eb8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -923,16 +923,8 @@ static int btree_readpage(struct file *file, struct page *page) static int btree_releasepage(struct page *page, gfp_t gfp_flags) { - struct extent_map_tree *map; - struct extent_io_tree *tree; - int ret; - if (PageWriteback(page) || PageDirty(page)) return 0; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - /* * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * slab allocation from alloc_extent_state down the callchain where @@ -940,11 +932,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) */ gfp_flags &= ~GFP_SLAB_BUG_MASK; - ret = try_release_extent_state(map, tree, page, gfp_flags); - if (!ret) - return 0; - - return try_release_extent_buffer(tree, page); + return try_release_extent_buffer(page, gfp_flags); } static void btree_invalidatepage(struct page *page, unsigned long offset) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9b7e7682fda0..1b831ac4c079 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5018,10 +5018,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); - } else { - invalidate_mapping_pages(info->btree_inode->i_mapping, - bytenr >> PAGE_CACHE_SHIFT, - (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } ret = update_block_group(trans, root, bytenr, num_bytes, 0); @@ -6022,6 +6018,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); + clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); btrfs_set_buffer_uptodate(buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0f74262911be..0ce14369920c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3607,6 +3607,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, list_add(&eb->leak_list, &buffers); spin_unlock_irqrestore(&leak_lock, flags); #endif + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); atomic_set(&eb->pages_reading, 0); @@ -3654,6 +3655,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, */ if (PagePrivate(page) && page->private == (unsigned long)eb) { + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); /* * We need to make sure we haven't be attached * to a new eb. @@ -3763,7 +3766,6 @@ again: if (!atomic_inc_not_zero(&exists->refs)) { spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); - synchronize_rcu(); exists = NULL; goto again; } @@ -3772,7 +3774,10 @@ again: goto free_eb; } /* add one reference for the tree */ + spin_lock(&eb->refs_lock); atomic_inc(&eb->refs); + set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags); + spin_unlock(&eb->refs_lock); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); @@ -3823,15 +3828,143 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, return NULL; } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + __free_extent_buffer(eb); +} + +static int extent_buffer_under_io(struct extent_buffer *eb, + struct page *locked_page) +{ + unsigned long num_pages, i; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + int need_unlock = 0; + + if (!page) + continue; + + if (page != locked_page) { + if (!trylock_page(page)) + return 1; + need_unlock = 1; + } + + if (PageDirty(page) || PageWriteback(page)) { + if (need_unlock) + unlock_page(page); + return 1; + } + if (need_unlock) + unlock_page(page); + } + + return 0; +} + +/* Expects to have eb->eb_lock already held */ +static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +{ + WARN_ON(atomic_read(&eb->refs) == 0); + if (atomic_dec_and_test(&eb->refs)) { + struct extent_io_tree *tree = eb->tree; + int ret; + + spin_unlock(&eb->refs_lock); + + might_sleep_if(mask & __GFP_WAIT); + ret = clear_extent_bit(tree, eb->start, + eb->start + eb->len - 1, -1, 0, 0, + NULL, mask); + if (ret < 0) { + unsigned long num_pages, i; + + num_pages = num_extent_pages(eb->start, eb->len); + /* + * We failed to clear the state bits which likely means + * ENOMEM, so just re-up the eb ref and continue, we + * will get freed later on via releasepage or something + * else and will be ok. + */ + spin_lock(&eb->tree->mapping->private_lock); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags); + atomic_inc(&eb->refs); + + /* + * We may have started to reclaim the pages for a newly + * allocated eb, make sure we own all of them again. + */ + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + + if (!page) { + WARN_ON(1); + continue; + } + + BUG_ON(!PagePrivate(page)); + if (page->private != (unsigned long)eb) { + ClearPagePrivate(page); + page_cache_release(page); + attach_extent_buffer_page(eb, page); + } + } + spin_unlock(&eb->refs_lock); + spin_unlock(&eb->tree->mapping->private_lock); + return; + } + + spin_lock(&tree->buffer_lock); + radix_tree_delete(&tree->buffer, + eb->start >> PAGE_CACHE_SHIFT); + spin_unlock(&tree->buffer_lock); + + /* Should be safe to release our pages at this point */ + btrfs_release_extent_buffer_page(eb, 0); + + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); + return; + } + spin_unlock(&eb->refs_lock); +} + void free_extent_buffer(struct extent_buffer *eb) { if (!eb) return; - if (!atomic_dec_and_test(&eb->refs)) + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) == 2 && + test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && + !extent_buffer_under_io(eb, NULL) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + + /* + * I know this is terrible, but it's temporary until we stop tracking + * the uptodate bits and such for the extent buffers. + */ + release_extent_buffer(eb, GFP_ATOMIC); +} + +void free_extent_buffer_stale(struct extent_buffer *eb) +{ + if (!eb) return; - WARN_ON(1); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_STALE, &eb->bflags); + + if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb, NULL) && + test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + atomic_dec(&eb->refs); + release_extent_buffer(eb, GFP_NOFS); } int clear_extent_buffer_dirty(struct extent_io_tree *tree, @@ -3874,6 +4007,7 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree, was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); + WARN_ON(atomic_read(&eb->refs) == 0); for (i = 0; i < num_pages; i++) __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); return was_dirty; @@ -4440,45 +4574,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +int try_release_extent_buffer(struct page *page, gfp_t mask) { - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - __free_extent_buffer(eb); -} - -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - struct extent_buffer *eb = (struct extent_buffer *)page->private; - int ret = 1; - - if (!PagePrivate(page) || !eb) - return 1; - - spin_lock(&tree->buffer_lock); - if (atomic_read(&eb->refs) > 1 || - test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - ret = 0; - goto out; - } + struct extent_buffer *eb; /* - * set @eb->refs to 0 if it is already 1, and then release the @eb. - * Or go back. + * We need to make sure noboody is attaching this page to an eb right + * now. */ - if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { - ret = 0; - goto out; + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + return 1; } - radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); - btrfs_release_extent_buffer_page(eb, 0); -out: - spin_unlock(&tree->buffer_lock); - /* at this point we can safely release the extent buffer */ - if (atomic_read(&eb->refs) == 0) - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return ret; + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + + /* + * This is a little awful but should be ok, we need to make sure that + * the eb doesn't disappear out from under us while we're looking at + * this page. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb, page)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&page->mapping->private_lock); + return 0; + } + spin_unlock(&page->mapping->private_lock); + + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + + /* + * If tree ref isn't set then we know the ref on this eb is a real ref, + * so just return, this page will likely be freed soon anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + return 0; + } + release_extent_buffer(eb, mask); + + return 1; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 83e432da2e26..60628341f156 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -35,6 +35,8 @@ #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ +#define EXTENT_BUFFER_TREE_REF 5 +#define EXTENT_BUFFER_STALE 6 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -128,6 +130,7 @@ struct extent_buffer { unsigned long map_len; unsigned long bflags; struct extent_io_tree *tree; + spinlock_t refs_lock; atomic_t refs; atomic_t pages_reading; struct list_head leak_list; @@ -184,7 +187,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_buffer(struct page *page, gfp_t mask); int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); @@ -261,6 +264,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_NONE 0 #define WAIT_COMPLETE 1 #define WAIT_PAGE_LOCK 2