diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0cce3aafbd62..e3a4fd70f55a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1244,7 +1244,10 @@ struct btrfs_stripe_hash { /* used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash_table { - struct btrfs_stripe_hash *table; + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; }; #define BTRFS_STRIPE_HASH_TABLE_BITS 11 diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d02510f34936..7ccddca9ee71 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -47,6 +47,20 @@ /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT 2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT 3 + + +#define RBIO_CACHE_SIZE 1024 + struct btrfs_raid_bio { struct btrfs_fs_info *fs_info; struct btrfs_bio *bbio; @@ -65,6 +79,11 @@ struct btrfs_raid_bio { */ struct list_head hash_list; + /* + * LRU list for the stripe cache + */ + struct list_head stripe_cache; + /* * for scheduling work in the helper threads */ @@ -176,7 +195,9 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) if (!table) return -ENOMEM; - table->table = (void *)(table + 1); + spin_lock_init(&table->cache_lock); + INIT_LIST_HEAD(&table->stripe_cache); + h = table->table; for (i = 0; i < num_entries; i++) { @@ -192,6 +213,42 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array. We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + char *s; + char *d; + int ret; + + ret = alloc_rbio_pages(rbio); + if (ret) + return; + + for (i = 0; i < rbio->nr_pages; i++) { + if (!rbio->bio_pages[i]) + continue; + + s = kmap(rbio->bio_pages[i]); + d = kmap(rbio->stripe_pages[i]); + + memcpy(d, s, PAGE_CACHE_SIZE); + + kunmap(rbio->bio_pages[i]); + kunmap(rbio->stripe_pages[i]); + SetPageUptodate(rbio->stripe_pages[i]); + } + set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + /* * we hash on the first logical address of the stripe */ @@ -210,6 +267,34 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ + int i; + struct page *s; + struct page *d; + + if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) + return; + + for (i = 0; i < dest->nr_pages; i++) { + s = src->stripe_pages[i]; + if (!s || !PageUptodate(s)) { + continue; + } + + d = dest->stripe_pages[i]; + if (d) + __free_page(d); + + dest->stripe_pages[i] = s; + src->stripe_pages[i] = NULL; + } +} + /* * merging means we take the bio_list from the victim and * splice it into the destination. The victim should @@ -226,16 +311,170 @@ static void merge_rbio(struct btrfs_raid_bio *dest, } /* - * free the hash table used by unmount + * used to prune items that are in the cache. The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash *h; + int freeit = 0; + + /* + * check the bit again under the hash table lock. + */ + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + h = table->table + bucket; + + /* hold the lock for the bucket because we may be + * removing it from the hash table + */ + spin_lock(&h->lock); + + /* + * hold the lock for the bio list because we need + * to make sure the bio list is empty + */ + spin_lock(&rbio->bio_list_lock); + + if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { + list_del_init(&rbio->stripe_cache); + table->cache_size -= 1; + freeit = 1; + + /* if the bio list isn't empty, this rbio is + * still involved in an IO. We take it out + * of the cache list, and drop the ref that + * was held for the list. + * + * If the bio_list was empty, we also remove + * the rbio from the hash_table, and drop + * the corresponding ref + */ + if (bio_list_empty(&rbio->bio_list)) { + if (!list_empty(&rbio->hash_list)) { + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + BUG_ON(!list_empty(&rbio->plug_list)); + } + } + } + + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (freeit) + __free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + __remove_rbio_from_cache(rbio); + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount */ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) { if (!info->stripe_hash_table) return; + btrfs_clear_rbio_cache(info); kfree(info->stripe_hash_table); info->stripe_hash_table = NULL; } +/* + * insert an rbio into the stripe cache. It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ + if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) + atomic_inc(&rbio->refs); + + if (!list_empty(&rbio->stripe_cache)){ + list_move(&rbio->stripe_cache, &table->stripe_cache); + } else { + list_add(&rbio->stripe_cache, &table->stripe_cache); + table->cache_size += 1; + } + + spin_unlock(&rbio->bio_list_lock); + + if (table->cache_size > RBIO_CACHE_SIZE) { + struct btrfs_raid_bio *found; + + found = list_entry(table->stripe_cache.prev, + struct btrfs_raid_bio, + stripe_cache); + + if (found != rbio) + __remove_rbio_from_cache(found); + } + + spin_unlock_irqrestore(&table->cache_lock, flags); + return; +} + /* * helper function to run the xor_blocks api. It is only * able to do MAX_XOR_BLOCKS at a time, so we need to @@ -303,6 +542,17 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) return 0; + /* + * we can't merge with cached rbios, since the + * idea is that when we merge the destination + * rbio is going to run our IO for us. We can + * steal from cached rbio's though, other functions + * handle that. + */ + if (test_bit(RBIO_CACHE_BIT, &last->flags) || + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + if (last->raid_map[0] != cur->raid_map[0]) return 0; @@ -370,6 +620,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) unsigned long flags; DEFINE_WAIT(wait); struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; int walk = 0; @@ -379,6 +630,21 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) if (cur->raid_map[0] == rbio->raid_map[0]) { spin_lock(&cur->bio_list_lock); + /* can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + atomic_dec(&cur->refs); + + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); + + goto lockit; + } + /* can we merge into the lock owner? */ if (rbio_can_merge(cur, rbio)) { merge_rbio(cur, rbio); @@ -388,6 +654,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) goto out; } + /* * we couldn't merge with the running * rbio, see if we can merge with the @@ -417,11 +684,13 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) goto out; } } - +lockit: atomic_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock_irqrestore(&h->lock, flags); + if (cache_drop) + remove_rbio_from_cache(cache_drop); if (freeit) __free_raid_bio(freeit); return ret; @@ -436,14 +705,30 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) int bucket; struct btrfs_stripe_hash *h; unsigned long flags; + int keep_cache = 0; bucket = rbio_bucket(rbio); h = rbio->fs_info->stripe_hash_table->table + bucket; + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + spin_lock_irqsave(&h->lock, flags); spin_lock(&rbio->bio_list_lock); if (!list_empty(&rbio->hash_list)) { + /* + * if we're still cached and there is no other IO + * to perform, just leave this rbio here for others + * to steal from later + */ + if (list_empty(&rbio->plug_list) && + test_bit(RBIO_CACHE_BIT, &rbio->flags)) { + keep_cache = 1; + clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + BUG_ON(!bio_list_empty(&rbio->bio_list)); + goto done; + } list_del_init(&rbio->hash_list); atomic_dec(&rbio->refs); @@ -469,11 +754,12 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (next->read_rebuild) async_read_rebuild(next); - else + else { + steal_rbio(rbio, next); async_rmw_stripe(next); + } goto done_nolock; - } else if (waitqueue_active(&h->wait)) { spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); @@ -481,11 +767,13 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) goto done_nolock; } } +done: spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); done_nolock: - return; + if (!keep_cache) + remove_rbio_from_cache(rbio); } static void __free_raid_bio(struct btrfs_raid_bio *rbio) @@ -496,6 +784,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) if (!atomic_dec_and_test(&rbio->refs)) return; + WARN_ON(!list_empty(&rbio->stripe_cache)); WARN_ON(!list_empty(&rbio->hash_list)); WARN_ON(!bio_list_empty(&rbio->bio_list)); @@ -630,6 +919,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, bio_list_init(&rbio->bio_list); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); + INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bbio = bbio; rbio->raid_map = raid_map; @@ -864,8 +1154,17 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) /* * now that we've set rmw_locked, run through the * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. */ index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { struct page *p; @@ -1155,6 +1454,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) continue; page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + ret = rbio_add_io_page(rbio, &bio_list, page, stripe, pagenr, rbio->stripe_len); if (ret) @@ -1440,6 +1746,11 @@ cleanup: cleanup_io: if (rbio->read_rebuild) { + if (err == 0) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + rbio_orig_end_io(rbio, err, err == 0); } else if (err == 0) { rbio->faila = -1; @@ -1505,7 +1816,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) atomic_set(&rbio->bbio->error, 0); /* - * read everything that hasn't failed. + * read everything that hasn't failed. Thanks to the + * stripe cache, it is possible that some or all of these + * pages are going to be uptodate. */ for (stripe = 0; stripe < bbio->num_stripes; stripe++) { if (rbio->faila == stripe ||