linux/fs/f2fs/recovery.c
Chao Yu f61cce5b81 f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds.  Have a nice day...

After rmmod f2fs module, kenrel shows following dmesg:
 =============================================================================
 BUG f2fs_inode_cache (Tainted: G           O   ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
 -----------------------------------------------------------------------------

 Disabling lock debugging due to kernel taint
 INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
 CPU: 3 PID: 7455 Comm: rmmod Tainted: G    B      O    4.6.0-rc4+ #16
 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
  00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
  c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
  616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
 Call Trace:
  [<c13a83a0>] dump_stack+0x5f/0x8f
  [<c11c7276>] slab_err+0x76/0x80
  [<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
  [<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
  [<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
  [<c1198a38>] kmem_cache_destroy+0x158/0x1f0
  [<c176b43d>] ? mutex_unlock+0xd/0x10
  [<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
  [<c10f596c>] SyS_delete_module+0x16c/0x1d0
  [<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
  [<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
  [<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
  [<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
  [<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
  [<c176d888>] sysenter_past_esp+0x45/0x74
 INFO: Object 0xd1e6d9e0 @offset=6624
 kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
 CPU: 3 PID: 7455 Comm: rmmod Tainted: G    B      O    4.6.0-rc4+ #16
 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
  00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
  c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
  f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
 Call Trace:
  [<c13a83a0>] dump_stack+0x5f/0x8f
  [<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
  [<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
  [<c10f596c>] SyS_delete_module+0x16c/0x1d0
  [<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
  [<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
  [<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
  [<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
  [<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
  [<c176d888>] sysenter_past_esp+0x45/0x74

The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.

But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.

We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.

This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 10:44:54 -07:00

651 lines
15 KiB
C

/*
* fs/f2fs/recovery.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
/*
* Roll forward recovery scenarios.
*
* [Term] F: fsync_mark, D: dentry_mark
*
* 1. inode(x) | CP | inode(x) | dnode(F)
* -> Update the latest inode(x).
*
* 2. inode(x) | CP | inode(F) | dnode(F)
* -> No problem.
*
* 3. inode(x) | CP | dnode(F) | inode(x)
* -> Recover to the latest dnode(F), and drop the last inode(x)
*
* 4. inode(x) | CP | dnode(F) | inode(F)
* -> No problem.
*
* 5. CP | inode(x) | dnode(F)
* -> The inode(DF) was missing. Should drop this dnode(F).
*
* 6. CP | inode(DF) | dnode(F)
* -> No problem.
*
* 7. CP | dnode(F) | inode(DF)
* -> If f2fs_iget fails, then goto next to find inode(DF).
*
* 8. CP | dnode(F) | inode(x)
* -> If f2fs_iget fails, then goto next to find inode(DF).
* But it will fail due to no inode(DF).
*/
static struct kmem_cache *fsync_entry_slab;
bool space_for_roll_forward(struct f2fs_sb_info *sbi)
{
if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
> sbi->user_block_count)
return false;
return true;
}
static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
nid_t ino)
{
struct fsync_inode_entry *entry;
list_for_each_entry(entry, head, list)
if (entry->inode->i_ino == ino)
return entry;
return NULL;
}
static struct fsync_inode_entry *add_fsync_inode(struct list_head *head,
struct inode *inode)
{
struct fsync_inode_entry *entry;
entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
if (!entry)
return NULL;
entry->inode = inode;
list_add_tail(&entry->list, head);
return entry;
}
static void del_fsync_inode(struct fsync_inode_entry *entry)
{
iput(entry->inode);
list_del(&entry->list);
kmem_cache_free(fsync_entry_slab, entry);
}
static int recover_dentry(struct inode *inode, struct page *ipage,
struct list_head *dir_list)
{
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
struct qstr name;
struct page *page;
struct inode *dir, *einode;
struct fsync_inode_entry *entry;
int err = 0;
entry = get_fsync_inode(dir_list, pino);
if (!entry) {
dir = f2fs_iget(inode->i_sb, pino);
if (IS_ERR(dir)) {
err = PTR_ERR(dir);
goto out;
}
entry = add_fsync_inode(dir_list, dir);
if (!entry) {
err = -ENOMEM;
iput(dir);
goto out;
}
}
dir = entry->inode;
if (file_enc_name(inode))
return 0;
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
if (unlikely(name.len > F2FS_NAME_LEN)) {
WARN_ON(1);
err = -ENAMETOOLONG;
goto out;
}
retry:
de = f2fs_find_entry(dir, &name, &page);
if (de && inode->i_ino == le32_to_cpu(de->ino))
goto out_unmap_put;
if (de) {
einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
WARN_ON(1);
err = PTR_ERR(einode);
if (err == -ENOENT)
err = -EEXIST;
goto out_unmap_put;
}
err = acquire_orphan_inode(F2FS_I_SB(inode));
if (err) {
iput(einode);
goto out_unmap_put;
}
f2fs_delete_entry(de, page, dir, einode);
iput(einode);
goto retry;
}
err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
goto out;
out_unmap_put:
f2fs_dentry_kunmap(dir, page);
f2fs_put_page(page, 0);
out:
f2fs_msg(inode->i_sb, KERN_NOTICE,
"%s: ino = %x, name = %s, dir = %lx, err = %d",
__func__, ino_of_node(ipage), raw_inode->i_name,
IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
static void recover_inode(struct inode *inode, struct page *page)
{
struct f2fs_inode *raw = F2FS_INODE(page);
char *name;
inode->i_mode = le16_to_cpu(raw->i_mode);
i_size_write(inode, le64_to_cpu(raw->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
if (file_enc_name(inode))
name = "<encrypted>";
else
name = F2FS_INODE(page)->i_name;
f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
ino_of_node(page), name);
}
static bool is_same_inode(struct inode *inode, struct page *ipage)
{
struct f2fs_inode *ri = F2FS_INODE(ipage);
struct timespec disk;
if (!IS_INODE(ipage))
return true;
disk.tv_sec = le64_to_cpu(ri->i_ctime);
disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
if (timespec_compare(&inode->i_ctime, &disk) > 0)
return false;
disk.tv_sec = le64_to_cpu(ri->i_atime);
disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
if (timespec_compare(&inode->i_atime, &disk) > 0)
return false;
disk.tv_sec = le64_to_cpu(ri->i_mtime);
disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
if (timespec_compare(&inode->i_mtime, &disk) > 0)
return false;
return true;
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
struct inode *inode;
struct page *page = NULL;
block_t blkaddr;
int err = 0;
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
while (1) {
struct fsync_inode_entry *entry;
if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page))
break;
if (!is_fsync_dnode(page))
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
if (entry) {
if (!is_same_inode(entry->inode, page))
goto next;
} else {
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
break;
}
/*
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
inode = f2fs_iget(sbi->sb, ino_of_node(page));
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
if (err == -ENOENT) {
err = 0;
goto next;
}
break;
}
/* add this fsync inode to the list */
entry = add_fsync_inode(head, inode);
if (!entry) {
err = -ENOMEM;
iput(inode);
break;
}
}
entry->blkaddr = blkaddr;
if (IS_INODE(page) && is_dent_dnode(page))
entry->last_dentry = blkaddr;
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
ra_meta_pages_cond(sbi, blkaddr);
}
f2fs_put_page(page, 1);
return err;
}
static void destroy_fsync_dnodes(struct list_head *head)
{
struct fsync_inode_entry *entry, *tmp;
list_for_each_entry_safe(entry, tmp, head, list)
del_fsync_inode(entry);
}
static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
block_t blkaddr, struct dnode_of_data *dn)
{
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
struct f2fs_summary_block *sum_node;
struct f2fs_summary sum;
struct page *sum_page, *node_page;
struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
unsigned int offset;
block_t bidx;
int i;
sentry = get_seg_entry(sbi, segno);
if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
return 0;
/* Get the previous summary */
for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
goto got_it;
}
}
sum_page = get_sum_page(sbi, segno);
sum_node = (struct f2fs_summary_block *)page_address(sum_page);
sum = sum_node->entries[blkoff];
f2fs_put_page(sum_page, 1);
got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
if (dn->inode->i_ino == nid) {
tdn.nid = nid;
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
goto truncate_out;
} else if (dn->nid == nid) {
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
goto truncate_out;
}
/* Get the node page */
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
return PTR_ERR(node_page);
offset = ofs_of_node(node_page);
ino = ino_of_node(node_page);
f2fs_put_page(node_page, 1);
if (ino != dn->inode->i_ino) {
/* Deallocate previous index in the node page */
inode = f2fs_iget(sbi->sb, ino);
if (IS_ERR(inode))
return PTR_ERR(inode);
} else {
inode = dn->inode;
}
bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
/*
* if inode page is locked, unlock temporarily, but its reference
* count keeps alive.
*/
if (ino == dn->inode->i_ino && dn->inode_page_locked)
unlock_page(dn->inode_page);
set_new_dnode(&tdn, inode, NULL, NULL, 0);
if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
goto out;
if (tdn.data_blkaddr == blkaddr)
truncate_data_blocks_range(&tdn, 1);
f2fs_put_dnode(&tdn);
out:
if (ino != dn->inode->i_ino)
iput(inode);
else if (dn->inode_page_locked)
lock_page(dn->inode_page);
return 0;
truncate_out:
if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
truncate_data_blocks_range(&tdn, 1);
if (dn->inode->i_ino == nid && !dn->inode_page_locked)
unlock_page(dn->inode_page);
return 0;
}
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page, block_t blkaddr)
{
struct dnode_of_data dn;
struct node_info ni;
unsigned int start, end;
int err = 0, recovered = 0;
/* step 1: recover xattr */
if (IS_INODE(page)) {
recover_inline_xattr(inode, page);
} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
/*
* Deprecated; xattr blocks should be found from cold log.
* But, we should remain this for backward compatibility.
*/
recover_xattr_data(inode, page, blkaddr);
goto out;
}
/* step 2: recover inline data */
if (recover_inline_data(inode, page))
goto out;
/* step 3: recover data indices */
start = start_bidx_of_node(ofs_of_node(page), inode);
end = start + ADDRS_PER_PAGE(page, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err)
goto out;
f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
get_node_info(sbi, dn.nid, &ni);
f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
src = datablock_addr(dn.node_page, dn.ofs_in_node);
dest = datablock_addr(page, dn.ofs_in_node);
/* skip recovering if dest is the same as src */
if (src == dest)
continue;
/* dest is invalid, just invalidate src block */
if (dest == NULL_ADDR) {
truncate_data_blocks_range(&dn, 1);
continue;
}
/*
* dest is reserved block, invalidate src block
* and then reserve one new block in dnode page.
*/
if (dest == NEW_ADDR) {
truncate_data_blocks_range(&dn, 1);
err = reserve_new_block(&dn);
f2fs_bug_on(sbi, err);
continue;
}
/* dest is valid block, try to recover from src to dest */
if (is_valid_blkaddr(sbi, dest, META_POR)) {
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
/* We should not get -ENOSPC */
f2fs_bug_on(sbi, err);
}
/* Check the previous node page having this index */
err = check_index_in_prev_nodes(sbi, dest, &dn);
if (err)
goto err;
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
ni.version, false, false);
recovered++;
}
}
if (IS_INODE(dn.node_page))
sync_inode_page(&dn);
copy_node_footer(dn.node_page, page);
fill_node_footer(dn.node_page, dn.nid, ni.ino,
ofs_of_node(page), false);
set_page_dirty(dn.node_page);
err:
f2fs_put_dnode(&dn);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
"recover_data: ino = %lx, recovered = %d blocks, err = %d",
inode->i_ino, recovered, err);
return err;
}
static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct list_head *dir_list)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
struct page *page = NULL;
int err = 0;
block_t blkaddr;
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
while (1) {
struct fsync_inode_entry *entry;
if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
ra_meta_pages_cond(sbi, blkaddr);
page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page)) {
f2fs_put_page(page, 1);
break;
}
entry = get_fsync_inode(inode_list, ino_of_node(page));
if (!entry)
goto next;
/*
* inode(x) | CP | inode(x) | dnode(F)
* In this case, we can lose the latest inode(x).
* So, call recover_inode for the inode update.
*/
if (IS_INODE(page))
recover_inode(entry->inode, page);
if (entry->last_dentry == blkaddr) {
err = recover_dentry(entry->inode, page, dir_list);
if (err) {
f2fs_put_page(page, 1);
break;
}
}
err = do_recover_data(sbi, entry->inode, page, blkaddr);
if (err) {
f2fs_put_page(page, 1);
break;
}
if (entry->blkaddr == blkaddr)
del_fsync_inode(entry);
next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
}
if (!err)
allocate_new_segments(sbi);
return err;
}
int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
struct list_head inode_list;
struct list_head dir_list;
block_t blkaddr;
int err;
int ret = 0;
bool need_writecp = false;
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
if (!fsync_entry_slab)
return -ENOMEM;
INIT_LIST_HEAD(&inode_list);
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list);
if (err || list_empty(&inode_list))
goto out;
if (check_only) {
ret = 1;
goto out;
}
need_writecp = true;
/* step #2: recover data */
err = recover_data(sbi, &inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
(loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
}
clear_sbi_flag(sbi, SBI_POR_DOING);
if (err) {
bool invalidate = false;
if (discard_next_dnode(sbi, blkaddr))
invalidate = true;
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
sync_meta_pages(sbi, META, LONG_MAX);
/* invalidate temporary meta page */
if (invalidate)
invalidate_mapping_pages(META_MAPPING(sbi),
blkaddr, blkaddr);
set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
mutex_unlock(&sbi->cp_mutex);
} else if (need_writecp) {
struct cp_control cpc = {
.reason = CP_RECOVERY,
};
mutex_unlock(&sbi->cp_mutex);
err = write_checkpoint(sbi, &cpc);
} else {
mutex_unlock(&sbi->cp_mutex);
}
destroy_fsync_dnodes(&dir_list);
kmem_cache_destroy(fsync_entry_slab);
return ret ? ret: err;
}