From b224ba20b4d44e7fed8eaa92d20c9ff3009023fe Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 1 Jan 2019 21:33:11 +0800 Subject: [PATCH 01/54] f2fs: change error code to -ENOMEM from -EINVAL The error case of failing allocating memory should return -ENOMEM. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a9e7a1e62c66..4d23344976fd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, if (!qname) { f2fs_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return -EINVAL; + return -ENOMEM; } if (F2FS_OPTION(sbi).s_qf_names[qtype]) { if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) From 746a1117c57ad4cdbc2ae38568ad115af8d92336 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Thu, 3 Jan 2019 20:06:38 +0800 Subject: [PATCH 02/54] f2fs: fix compile warnings: 'struct *' declared inside parameter list We meet these compile warnings below, which caused by missing declare structs: struct f2fs_io_info, struct extent, struct f2fs_sb_info. warning: 'struct f2fs_io_info' declared inside parameter list warning: 'struct extent_info' declared inside parameter list warning: 'struct f2fs_sb_info' declared inside parameter list Signed-off-by: Zhikang Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 0cdf6cc5c557..9bbb7c916c12 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -141,6 +141,9 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }) +struct f2fs_sb_info; +struct f2fs_io_info; +struct extent_info; struct victim_sel_policy; struct f2fs_map_blocks; From f3a0a4ad1ea473706924e3edd45b74bd0b831258 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 4 Jan 2019 01:38:29 +0000 Subject: [PATCH 03/54] f2fs: remove set but not used variable 'err' Fixes gcc '-Wunused-but-set-variable' warning: fs/f2fs/data.c: In function 'f2fs_dio_submit_bio': fs/f2fs/data.c:2585:6: warning: variable 'err' set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 35285a3d3ff6..2386a2839842 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2578,14 +2578,11 @@ static void f2fs_dio_submit_bio(int rw, struct bio *bio, struct inode *inode, { struct f2fs_private_dio *dio; bool write = (rw == REQ_OP_WRITE); - int err; dio = f2fs_kzalloc(F2FS_I_SB(inode), sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) { - err = -ENOMEM; + if (!dio) goto out; - } dio->inode = inode; dio->orig_end_io = bio->bi_end_io; From b5897a1fad59367a783c98babafff41c6a3731e2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Jan 2019 17:39:53 +0800 Subject: [PATCH 04/54] f2fs: check inject_rate validity during configuring Type of inject_rate is unsigned int, let's check new value's validity during configuring. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 948c1a211341..b427122cb71a 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -222,6 +222,8 @@ out: #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); From b9aad0e3d072272cc27ec556538866df74a75af8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2019 17:19:08 -0800 Subject: [PATCH 05/54] f2fs: export FS_NOCOW_FL flag to user This exports pin_file status to user. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11f86e439ac7..09862e862b06 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2366,11 +2366,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */ #define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ /* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b44e02b8c677..5942e6d544d2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1658,6 +1658,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + flags |= F2FS_NOCOW_FL; flags &= F2FS_FL_USER_VISIBLE; From 1ba66eab9c23ab887c5c8ff1194c06d5b3e9caf7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Jan 2019 14:26:18 +0100 Subject: [PATCH 06/54] f2fs: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Cc: Jaegeuk Kim Cc: Chao Yu Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Greg Kroah-Hartman Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 20 +++----------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/super.c | 5 +---- 3 files changed, 6 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f05b37ef7182..d00ba9b711a4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -522,30 +522,16 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kvfree(si); } -int __init f2fs_create_root_stats(void) +void __init f2fs_create_root_stats(void) { - struct dentry *file; - f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); - if (!f2fs_debugfs_root) - return -ENOMEM; - file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, - NULL, &stat_fops); - if (!file) { - debugfs_remove(f2fs_debugfs_root); - f2fs_debugfs_root = NULL; - return -ENOMEM; - } - - return 0; + debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + &stat_fops); } void f2fs_destroy_root_stats(void) { - if (!f2fs_debugfs_root) - return; - debugfs_remove_recursive(f2fs_debugfs_root); f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 09862e862b06..b90ee556b66d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3413,7 +3413,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); -int __init f2fs_create_root_stats(void); +void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) do { } while (0) @@ -3451,7 +3451,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline int __init f2fs_create_root_stats(void) { return 0; } +static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4d23344976fd..033e1e9f89d2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3557,9 +3557,7 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - err = f2fs_create_root_stats(); - if (err) - goto free_filesystem; + f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; @@ -3567,7 +3565,6 @@ static int __init init_f2fs_fs(void) free_root_stats: f2fs_destroy_root_stats(); -free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); From 82e2ea2753149761ec98206b117051eee3e743a3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Jan 2019 10:21:24 +0800 Subject: [PATCH 07/54] f2fs: fix to trigger fsck if dirent.name_len is zero While traversing dirents in f2fs_fill_dentries(), if bitmap is valid, filename length should not be zero, otherwise, directory structure consistency could be corrupted, in this case, let's print related info and set SBI_NEED_FSCK to trigger fsck for repairing. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 719ba4d92098..fabdf8428c7a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (de->name_len == 0) { bit_pos++; ctx->pos = start_pos + bit_pos; + printk_ratelimited( + "%s, invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } From 9ca1182a3c21e67d62d2b7566b11f3ead82534d9 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 7 Jan 2019 15:02:34 +0800 Subject: [PATCH 08/54] f2fs: check if file namelen exceeds max value Dentry bitmap is not enough to detect incorrect dentries. So this patch also checks the namelen value of a dentry. Signed-off-by: Gong Chen Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fabdf8428c7a..8dff29053150 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (unlikely(bit_pos > d->max)) { + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); From b8243df07b586d46ab5bb7f7b9dd8b315f2439b5 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 14 Jan 2019 22:05:14 +0800 Subject: [PATCH 09/54] f2fs: add brackets for macros Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b90ee556b66d..eaa71bf8d49d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -322,7 +322,7 @@ struct discard_entry { /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ - (MAX_PLIST_NUM - 1) : (blk_num - 1)) + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ @@ -2848,9 +2848,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode) #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ - ((offsetof(typeof(*f2fs_inode), field) + \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ - <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { @@ -2879,8 +2879,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) -#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ - (!is_read_io(fio->op) || fio->is_meta)) +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \ + (!is_read_io((fio)->op) || (fio)->is_meta)) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); From 1b64f91923d05f3fd2b3dfe18c0b0159377d0745 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 15 Jan 2019 20:02:15 +0000 Subject: [PATCH 10/54] f2fs: UBSAN: set boolean value iostat_enable correctly When setting /sys/fs/f2fs//iostat_enable with non-bool value, UBSAN reports the following warning. [ 7562.295484] ================================================================================ [ 7562.296531] UBSAN: Undefined behaviour in fs/f2fs/f2fs.h:2776:10 [ 7562.297651] load of value 64 is not a valid value for type '_Bool' [ 7562.298642] CPU: 1 PID: 7487 Comm: dd Not tainted 4.20.0-rc4+ #79 [ 7562.298653] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 7562.298662] Call Trace: [ 7562.298760] dump_stack+0x46/0x5b [ 7562.298811] ubsan_epilogue+0x9/0x40 [ 7562.298830] __ubsan_handle_load_invalid_value+0x72/0x90 [ 7562.298863] f2fs_file_write_iter+0x29f/0x3f0 [ 7562.298905] __vfs_write+0x115/0x160 [ 7562.298922] vfs_write+0xa7/0x190 [ 7562.298934] ksys_write+0x50/0xc0 [ 7562.298973] do_syscall_64+0x4a/0xe0 [ 7562.298992] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 7562.299001] RIP: 0033:0x7fa45ec19c00 [ 7562.299004] Code: 73 01 c3 48 8b 0d 88 92 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d dd eb 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04 24 [ 7562.299044] RSP: 002b:00007ffca52b49e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 7562.299052] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa45ec19c00 [ 7562.299059] RDX: 0000000000000400 RSI: 000000000093f000 RDI: 0000000000000001 [ 7562.299065] RBP: 000000000093f000 R08: 0000000000000004 R09: 0000000000000000 [ 7562.299071] R10: 00007ffca52b47b0 R11: 0000000000000246 R12: 0000000000000400 [ 7562.299077] R13: 000000000093f000 R14: 000000000093f400 R15: 0000000000000000 [ 7562.299091] ================================================================================ So, if iostat_enable is enabled, set its value as true. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b427122cb71a..a609c0abbdd0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -280,10 +280,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } From c90a357aee5f155d94f46ab0b109388b72fac2ce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 10 Jan 2019 16:40:12 +0800 Subject: [PATCH 11/54] f2fs: fix to set sbi dirty correctly In order to record direct IO count, we add two additional type in enum count_type: F2FS_DIO_{WRITE,READ}, but those IO won't dirty filesystem metadata, so we don't need to set filesystem dirty in inc_page_count(), fix it. Fixes: 02b16d0a34a1 ("f2fs: add to account direct IO") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eaa71bf8d49d..705d9373b5ba 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1864,13 +1864,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || - count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || - count_type == F2FS_RD_META) - return; - - set_sbi_flag(sbi, SBI_IS_DIRTY); + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) From 35f7e2f6b1b39e6b1704567c27c8cacdd836e8ff Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Jan 2019 10:42:11 -0800 Subject: [PATCH 12/54] f2fs: run discard jobs when put_super When we umount f2fs, we need to avoid long delay due to discard commands, which is actually taking tens of seconds, if storage is very slow on UNMAP. So, this patch introduces timeout-based work on it. By default, let me give 5 seconds for discard. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 +++++++ fs/f2fs/f2fs.h | 5 ++++- fs/f2fs/segment.c | 11 ++++++++++- fs/f2fs/super.c | 4 +++- fs/f2fs/sysfs.c | 3 +++ 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index e09db014830f..e916c1eddf1b 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -86,6 +86,13 @@ Description: The unit size is one block, now only support configuring in range of [1, 512]. +What: /sys/fs/f2fs//umount_discard_timeout +Date: January 2019 +Contact: "Jaegeuk Kim" +Description: + Set timeout to issue discard commands during umount. + Default: 5 secs + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 705d9373b5ba..4c6b7d93d75f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -259,6 +259,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { int reason; @@ -378,6 +379,7 @@ struct discard_policy { bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ + int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -1175,6 +1177,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -3090,7 +3093,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ccefcf8016c4..7bb5ef731cbe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1117,6 +1117,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->timeout = 0; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1500,7 +1501,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; + if (dpolicy->timeout != 0) + f2fs_update_time(sbi, dpolicy->timeout); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout != 0 && + f2fs_time_over(sbi, dpolicy->timeout)) + break; + if (i + 1 < dpolicy->granularity) break; @@ -1687,7 +1695,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; @@ -1695,6 +1703,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 033e1e9f89d2..a579c46a5107 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1048,7 +1048,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_wait_discard_bios(sbi); + dropped = f2fs_issue_discard_timeout(sbi); if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && !sbi->discard_blks && !dropped) { @@ -2710,6 +2710,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a609c0abbdd0..d7b47662a0aa 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -426,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, interval_time[DISCARD_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, + umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -483,6 +485,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), From 3907a1e3993ac8220b71fe8cf9530fca390da610 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Jan 2019 17:48:38 -0800 Subject: [PATCH 13/54] f2fs: add quick mode of checkpoint=disable for QA This mode returns mount() quickly with EAGAIN. We can trigger this by shutdown(F2FS_GOING_DOWN_NEED_FSCK). Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 6 +++--- fs/f2fs/segment.c | 3 +++ fs/f2fs/super.c | 5 +++++ include/linux/f2fs_fs.h | 1 + 6 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 940372ebfc60..3fa4587ab0e8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1260,6 +1260,11 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); else diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4c6b7d93d75f..a1ae613fb28a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -259,6 +259,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { @@ -1166,6 +1167,7 @@ enum { SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5942e6d544d2..7a1217a236f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1972,11 +1972,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); - if (ret) - goto out; - break; + goto out; default: ret = -EINVAL; goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7bb5ef731cbe..1a091890b4c8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -868,6 +868,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) if (holes[DATA] > ovp || holes[NODE] > ovp) return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > overprovision_segments(sbi)) + return -EAGAIN; return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a579c46a5107..6cb1c2e94f34 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3213,6 +3213,10 @@ try_onemore: if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } /* Initialize device list */ err = f2fs_scan_devices(sbi); @@ -3399,6 +3403,7 @@ skip_recovery: cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; free_meta: diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 162f83358abb..fc763eab10b2 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -116,6 +116,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_DISABLED_QUICK_FLAG 0x00002000 #define CP_DISABLED_FLAG 0x00001000 #define CP_QUOTA_NEED_FSCK_FLAG 0x00000800 #define CP_LARGE_NAT_BITMAP_FLAG 0x00000400 From c21ea75cece27778b67f2135742132f2087c7357 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 09:12:13 -0800 Subject: [PATCH 14/54] f2fs: try to keep CP_TRIMMED_FLAG after successful umount If every discard were issued successfully, we can avoid further discard. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1a091890b4c8..6f0bfaeebb47 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1143,6 +1143,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = 1; } } From 0335a980d2380af9205af73ef2633d7b69960a4d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 10:26:39 -0800 Subject: [PATCH 15/54] f2fs: don't wake up too frequently, if there is lots of IOs Otherwise, it wakes up discard thread which will sleep again by busy IOs in a loop. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a77f76f528b6..5c7ed0442d6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) } } mutex_unlock(&dcc->cmd_lock); - if (!wakeup) + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = 1; From 909e1f05afb5522a3e5bf34e07e6c8a58d1d56cb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jan 2019 12:05:25 -0800 Subject: [PATCH 16/54] f2fs: avoid null pointer exception in dcc_info If dcc_info is not set yet, we can get null pointer panic. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a1ae613fb28a..b6582b8bbd72 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2226,10 +2226,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || - get_pages(sbi, F2FS_DIO_WRITE) || - atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || - atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + get_pages(sbi, F2FS_DIO_WRITE)) return false; + + if (SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return false; + return f2fs_time_over(sbi, type); } From 22cb0dec0ee2c1d493d3e4b409cafddfca9e3e72 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 27 Jan 2019 17:59:53 -0800 Subject: [PATCH 17/54] f2fs: flush quota blocks after turnning it off After quota_off, we'll get some dirty blocks. If put_super don't have a chance to flush them by checkpoint, it causes NULL pointer exception in end_io after iput(node_inode). (e.g., by checkpoint=disable) Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6cb1c2e94f34..38e50a4bf26d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2027,6 +2027,12 @@ void f2fs_quota_off_umount(struct super_block *sb) set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. + */ + sync_filesystem(sb); } static void f2fs_truncate_quota_inode_pages(struct super_block *sb) From 6df501312cd530ee49c4739caac9d579a45301ee Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 4 Feb 2019 10:46:42 -0800 Subject: [PATCH 18/54] fs: export evict_inodes This comes by 799ea9e9c599 ("xfs: evict all inodes involved with log redo item") Signed-off-by: Jaegeuk Kim --- fs/inode.c | 1 + fs/internal.h | 1 - include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index b0edef500590..9fa27369e8a2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -626,6 +626,7 @@ again: dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock diff --git a/fs/internal.h b/fs/internal.h index 71859c4d0b41..6e17ee3b7c2f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -121,7 +121,6 @@ extern void inode_add_lru(struct inode *inode); extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index e9382296305d..314bf410e3ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2640,6 +2640,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern unsigned int get_next_ino(void); +extern void evict_inodes(struct super_block *sb); extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); From 9eb897a05082d67ca10b977e5cc240bd42df3921 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Jan 2019 14:04:33 -0800 Subject: [PATCH 19/54] f2fs: sync filesystem after roll-forward recovery Some works after roll-forward recovery can get an error which will release all the data structures. Let's flush them in order to make it clean. One possible corruption came from: [ 90.400500] list_del corruption. prev->next should be ffffffed1f566208, but was (null) [ 90.675349] Call trace: [ 90.677869] __list_del_entry_valid+0x94/0xb4 [ 90.682351] remove_dirty_inode+0xac/0x114 [ 90.686563] __f2fs_write_data_pages+0x6a8/0x6c8 [ 90.691302] f2fs_write_data_pages+0x40/0x4c [ 90.695695] do_writepages+0x80/0xf0 [ 90.699372] __writeback_single_inode+0xdc/0x4ac [ 90.704113] writeback_sb_inodes+0x280/0x440 [ 90.708501] wb_writeback+0x1b8/0x3d0 [ 90.712267] wb_workfn+0x1a8/0x4d4 [ 90.715765] process_one_work+0x1c0/0x3d4 [ 90.719883] worker_thread+0x224/0x344 [ 90.723739] kthread+0x120/0x130 [ 90.727055] ret_from_fork+0x10/0x18 Reported-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++-- fs/f2fs/node.c | 4 +++- fs/f2fs/super.c | 44 +++++++++++++++++++++++++++++++++----------- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3fa4587ab0e8..0dc483a7f3d8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -307,8 +307,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || - get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) goto skip_write; /* if locked failed, cp will flush dirty pages instead */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f6a1e1e0f891..0f76c5e7998c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1925,7 +1925,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) goto skip_write; if (wbc->sync_mode == WB_SYNC_ALL) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 38e50a4bf26d..798db23f58a6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1459,9 +1459,16 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { + unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - int err; + int err = 0; + int ret; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_ERR, + "checkpoint=disable on readonly fs"); + return -EINVAL; + } sbi->sb->s_flags |= MS_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1469,18 +1476,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); - if (err == -ENODATA) + if (err == -ENODATA) { + err = 0; break; + } if (err && err != -EAGAIN) - return err; + break; } - err = sync_filesystem(sbi->sb); - if (err) - return err; + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? ret: err; + goto restore_flag; + } - if (f2fs_disable_cp_again(sbi)) - return -EAGAIN; + if (f2fs_disable_cp_again(sbi)) { + err = -EAGAIN; + goto restore_flag; + } mutex_lock(&sbi->gc_mutex); cpc.reason = CP_PAUSE; @@ -1489,7 +1502,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->unusable_block_count = 0; mutex_unlock(&sbi->gc_mutex); - return 0; +restore_flag: + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) @@ -3376,7 +3391,7 @@ skip_recovery: if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto free_meta; + goto sync_free_meta; } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { f2fs_enable_checkpoint(sbi); } @@ -3389,7 +3404,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) - goto free_meta; + goto sync_free_meta; } kvfree(options); @@ -3412,6 +3427,11 @@ skip_recovery: clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry = false; + free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); @@ -3425,6 +3445,8 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); From 04c71ef5d9f7531e58be5f219950f6527b03580a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 25 Jan 2019 15:35:01 +0800 Subject: [PATCH 20/54] f2fs: fix typos in code comments lengh -> length Signed-off-by: Geliang Tang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index fc763eab10b2..6dcde67ba171 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -187,7 +187,7 @@ struct f2fs_orphan_block { struct f2fs_extent { __le32 fofs; /* start file offset of the extent */ __le32 blk; /* start block address of the extent */ - __le32 len; /* lengh of the extent */ + __le32 len; /* length of the extent */ } __packed; #define F2FS_NAME_LEN 255 @@ -512,7 +512,7 @@ typedef __le32 f2fs_hash_t; struct f2fs_dir_entry { __le32 hash_code; /* hash code of file name */ __le32 ino; /* inode number */ - __le16 name_len; /* lengh of file name */ + __le16 name_len; /* length of file name */ __u8 file_type; /* file type */ } __packed; From b339a10ec62d86e6423852edafa0fc9c194f420d Mon Sep 17 00:00:00 2001 From: zhengliang Date: Thu, 24 Jan 2019 20:57:03 +0800 Subject: [PATCH 21/54] f2fs: fix to data block override node segment by mistake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following race could lead to data block override node segment by mistake. Task A | Task B | Task C | Task D ======= | ======== |========== | ========= open file | | | white file | | | submit bio | | | wait io complete | | | | remove file | | ........ | iput_final | | | | sync | | | do checkpoint | | | data segment free | | | | create file1 | | | allocate node segment(if it is the same segment freed by Task C) f2fs_write_end_io | | | So we need to guarantee io complete before truncate inode in f2fs_drop_inode. Signed-off-by: Zheng Liang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 798db23f58a6..166e999774bd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -915,6 +915,10 @@ static int f2fs_drop_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); f2fs_i_size_write(inode, 0); + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); From 843809bad2741d2440df328d1bcb5e6faf9b8ad4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 24 Jan 2019 17:18:07 +0800 Subject: [PATCH 22/54] f2fs: fix to document inline_xattr_size option We missed to add document for inline_xattr_size mount option in f2fs.txt, add it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 348d80440433..564ccc6c7bad 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -126,6 +126,8 @@ disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. noinline_xattr Disable the inline xattrs feature. +inline_xattr_size=%u Support configuring inline xattr size, it depends on + flexible inline xattr feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created From b8fe720a614e912889603e4658223d9dec48b361 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 23 Jan 2019 15:49:44 +0800 Subject: [PATCH 23/54] f2fs: jump to label 'free_node_inode' when failing from d_make_root() When sb->s_root is NULL dput() will do nothing, so jump to label 'free_node_inode' instead of lable 'free_root_inode' when failing from d_make_root(). Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 166e999774bd..fa54715e4132 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3329,7 +3329,7 @@ try_onemore: sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; - goto free_root_inode; + goto free_node_inode; } err = f2fs_register_sysfs(sbi); From be2a96a3badc2056e1a66d5613c87b957b3a3f69 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 24 Jan 2019 15:18:47 +0800 Subject: [PATCH 24/54] f2fs: fix to check inline_xattr_size boundary correctly We use below condition to check inline_xattr_size boundary: if (!F2FS_OPTION(sbi).inline_xattr_size || F2FS_OPTION(sbi).inline_xattr_size >= DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) There is there problems in that check: - we should allow inline_xattr_size equaling to min size of inline {data,dentry} area. - F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on different size unit, previous one is 4 bytes, latter one is 1 bytes. - DEF_MIN_INLINE_SIZE only indicate min size of inline data area, however, we need to consider min size of inline dentry area as well, minimal inline dentry should at least contain two entries: '.' and '..', so that min inline_dentry size is 40 bytes. .bitmap 1 * 1 = 1 .reserved 1 * 1 = 1 .dentry 11 * 2 = 22 .filename 8 * 2 = 16 total 40 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 10 +++++----- include/linux/f2fs_fs.h | 13 +++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b6582b8bbd72..8690d5cb32f4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -524,7 +524,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fa54715e4132..4c3015a119b4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -835,11 +835,11 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 6dcde67ba171..2f5fbc41118c 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -490,12 +490,12 @@ typedef __le32 f2fs_hash_t; /* * space utilization of regular dentry and inline dentry (w/o extra reservation) - * regular dentry inline dentry - * bitmap 1 * 27 = 27 1 * 23 = 23 - * reserved 1 * 3 = 3 1 * 7 = 7 - * dentry 11 * 214 = 2354 11 * 182 = 2002 - * filename 8 * 214 = 1712 8 * 182 = 1456 - * total 4096 3488 + * regular dentry inline dentry (def) inline dentry (min) + * bitmap 1 * 27 = 27 1 * 23 = 23 1 * 1 = 1 + * reserved 1 * 3 = 3 1 * 7 = 7 1 * 1 = 1 + * dentry 11 * 214 = 2354 11 * 182 = 2002 11 * 2 = 22 + * filename 8 * 214 = 1712 8 * 182 = 1456 8 * 2 = 16 + * total 4096 3488 40 * * Note: there are more reserved space in inline dentry than in regular * dentry, when converting inline dentry we should handle this carefully. @@ -507,6 +507,7 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) +#define MIN_INLINE_DENTRY_SIZE 40 /* just include '.' and '..' entries */ /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { From 8ef83050d5c115b5455758c7f2ac2da115c5f9f2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 22 Jan 2019 20:18:06 +0800 Subject: [PATCH 25/54] f2fs: fix to avoid deadlock of atomic file operations Thread A Thread B - __fput - f2fs_release_file - drop_inmem_pages - mutex_lock(&fi->inmem_lock) - __revoke_inmem_pages - lock_page(page) - open - f2fs_setattr - truncate_setsize - truncate_inode_pages_range - lock_page(page) - truncate_cleanup_page - f2fs_invalidate_page - drop_inmem_page - mutex_lock(&fi->inmem_lock); We may encounter above ABBA deadlock as reported by Kyungtae Kim: I'm reporting a bug in linux-4.17.19: "INFO: task hung in drop_inmem_page" (no reproducer) I think this might be somehow related to the following: https://groups.google.com/forum/#!searchin/syzkaller-bugs/INFO$3A$20task$20hung$20in$20%7Csort:date/syzkaller-bugs/c6soBTrdaIo/AjAzPeIzCgAJ ========================================= INFO: task syz-executor7:10822 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D27024 10822 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:3617 __mutex_lock_common kernel/locking/mutex.c:833 [inline] __mutex_lock+0x5bd/0x1410 kernel/locking/mutex.c:893 mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:908 drop_inmem_page+0xcb/0x810 fs/f2fs/segment.c:327 f2fs_invalidate_page+0x337/0x5e0 fs/f2fs/data.c:2401 do_invalidatepage mm/truncate.c:165 [inline] truncate_cleanup_page+0x261/0x330 mm/truncate.c:187 truncate_inode_pages_range+0x552/0x1610 mm/truncate.c:367 truncate_inode_pages mm/truncate.c:478 [inline] truncate_pagecache+0x6d/0x90 mm/truncate.c:801 truncate_setsize+0x81/0xa0 mm/truncate.c:826 f2fs_setattr+0x44f/0x1270 fs/f2fs/file.c:781 notify_change+0xa62/0xe80 fs/attr.c:313 do_truncate+0x12e/0x1e0 fs/open.c:63 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e459c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e45a6cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e45a700 INFO: task syz-executor7:10858 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D28880 10858 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 __rwsem_down_write_failed_common kernel/locking/rwsem-xadd.c:565 [inline] rwsem_down_write_failed+0x5e6/0xc90 kernel/locking/rwsem-xadd.c:594 call_rwsem_down_write_failed+0x17/0x30 arch/x86/lib/rwsem.S:117 __down_write arch/x86/include/asm/rwsem.h:142 [inline] down_write+0x58/0xa0 kernel/locking/rwsem.c:72 inode_lock include/linux/fs.h:713 [inline] do_truncate+0x120/0x1e0 fs/open.c:61 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e3b4c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e3b56cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071c238 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e3b5700 INFO: task syz-executor5:10829 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor5 D28760 10829 6308 0x80000002 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 io_schedule+0x21/0x80 kernel/sched/core.c:5179 wait_on_page_bit_common mm/filemap.c:1100 [inline] __lock_page+0x2b5/0x390 mm/filemap.c:1273 lock_page include/linux/pagemap.h:483 [inline] __revoke_inmem_pages+0xb35/0x11c0 fs/f2fs/segment.c:231 drop_inmem_pages+0xa3/0x3e0 fs/f2fs/segment.c:306 f2fs_release_file+0x2c7/0x330 fs/f2fs/file.c:1556 __fput+0x2c7/0x780 fs/file_table.c:209 ____fput+0x1a/0x20 fs/file_table.c:243 task_work_run+0x151/0x1d0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x8ba/0x30a0 kernel/exit.c:865 do_group_exit+0x13b/0x3a0 kernel/exit.c:968 get_signal+0x6bb/0x1650 kernel/signal.c:2482 do_signal+0x84/0x1b70 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x155/0x190 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x445/0x4e0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f1c68e74ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 000000000071bf80 RCX: 00000000004497b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000071bf80 RBP: 000000000071bf80 R08: 0000000000000000 R09: 000000000071bf58 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f1c68e759c0 R15: 00007f1c68e75700 This patch tries to use trylock_page to mitigate such deadlock condition for fix. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6f0bfaeebb47..67bcecfcb55c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -215,7 +215,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +228,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -318,13 +328,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + mutex_unlock(&fi->inmem_lock); + } + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -429,12 +445,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; From ecf753b3701d1556cdb57bf8442caf05b1030be6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 2 Feb 2019 17:33:01 +0800 Subject: [PATCH 26/54] f2fs: fix potential data inconsistence of checkpoint Previously, we changed lock from cp_rwsem to node_change, it solved the deadlock issue which was caused by below race condition: Thread A Thread B - f2fs_setattr - f2fs_lock_op -- read_lock - dquot_transfer - __dquot_transfer - dquot_acquire - commit_dqblk - f2fs_quota_write - f2fs_write_begin - f2fs_write_failed - write_checkpoint - block_operations - f2fs_lock_all -- write_lock - f2fs_truncate_blocks - f2fs_lock_op -- read_lock But it breaks the sematics of cp_rwsem, in other callers like: - f2fs_file_write_iter -> f2fs_write_begin -> f2fs_write_failed - f2fs_direct_IO -> f2fs_write_failed We allow to truncate dnode w/o cp_rwsem held, result in incorrect sit bitmap update, which can cause further data corruption. So this patch reverts previous fix implementation, and try to fix deadlock by skipping calling f2fs_truncate_blocks() in f2fs_write_failed() only for quota file, and keep the preallocated data/node in the tail of quota file, we can expecte that the preallocated space can be used to store quota info latter soon. Fixes: af033b2aa8a8 ("f2fs: guarantee journalled quota data by checkpoint") Signed-off-by: Gao Xiang Signed-off-by: Sheng Yong Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/file.c | 14 ++++++-------- fs/f2fs/inline.c | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2386a2839842..16a389733eb8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2308,7 +2308,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true, true); + if (!IS_NOQUOTA(inode)) + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8690d5cb32f4..607abef0abcf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2927,8 +2927,7 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7a1217a236f0..724b2094bdec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -592,8 +592,7 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -601,7 +600,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, int count = 0, err = 0; struct page *ipage; bool truncate_page = false; - int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -611,7 +609,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, goto free_partial; if (lock) - __do_map_lock(sbi, flag, true); + f2fs_lock_op(sbi); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -649,7 +647,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - __do_map_lock(sbi, flag, false); + f2fs_unlock_op(sbi); free_partial: /* lastly zero out the first data page */ if (!err) @@ -684,7 +682,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -1269,7 +1267,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true, false); + ret = f2fs_truncate_blocks(inode, new_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1454,7 +1452,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 6b4aa5db6a9a..ec067935ca78 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } @@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false, false); + f2fs_truncate_blocks(dir, 0, false); f2fs_remove_dirty_inode(dir); return err; } From 011f2e498d944c991ffd72955d654520afc29f4a Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 4 Feb 2019 13:36:53 +0530 Subject: [PATCH 27/54] f2fs: do not use mutex lock in atomic context Fix below warning coming because of using mutex lock in atomic context. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:98 in_atomic(): 1, irqs_disabled(): 0, pid: 585, name: sh Preemption disabled at: __radix_tree_preload+0x28/0x130 Call trace: dump_backtrace+0x0/0x2b4 show_stack+0x20/0x28 dump_stack+0xa8/0xe0 ___might_sleep+0x144/0x194 __might_sleep+0x58/0x8c mutex_lock+0x2c/0x48 f2fs_trace_pid+0x88/0x14c f2fs_set_node_page_dirty+0xd0/0x184 Do not use f2fs_radix_tree_insert() to avoid doing cond_resched() with spin_lock() acquired. Signed-off-by: Sahitya Tummala Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index ce2a5eb210b6..d0ab533a9ce8 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -14,7 +14,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } From d5aa6ed6b68fcdde27c83a9b091fceeb58dd623f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Feb 2019 20:57:53 -0800 Subject: [PATCH 28/54] Revert "f2fs: fix to check inline_xattr_size boundary correctly" This reverts commit 802a643228462e0e3f6e84977d912f1871f61467. --- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 10 +++++----- include/linux/f2fs_fs.h | 13 ++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 607abef0abcf..c836992d5a00 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -524,6 +524,7 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c3015a119b4..fa54715e4132 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -835,11 +835,11 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size > - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { + F2FS_OPTION(sbi).inline_xattr_size >= + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2f5fbc41118c..6dcde67ba171 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -490,12 +490,12 @@ typedef __le32 f2fs_hash_t; /* * space utilization of regular dentry and inline dentry (w/o extra reservation) - * regular dentry inline dentry (def) inline dentry (min) - * bitmap 1 * 27 = 27 1 * 23 = 23 1 * 1 = 1 - * reserved 1 * 3 = 3 1 * 7 = 7 1 * 1 = 1 - * dentry 11 * 214 = 2354 11 * 182 = 2002 11 * 2 = 22 - * filename 8 * 214 = 1712 8 * 182 = 1456 8 * 2 = 16 - * total 4096 3488 40 + * regular dentry inline dentry + * bitmap 1 * 27 = 27 1 * 23 = 23 + * reserved 1 * 3 = 3 1 * 7 = 7 + * dentry 11 * 214 = 2354 11 * 182 = 2002 + * filename 8 * 214 = 1712 8 * 182 = 1456 + * total 4096 3488 * * Note: there are more reserved space in inline dentry than in regular * dentry, when converting inline dentry we should handle this carefully. @@ -507,7 +507,6 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) -#define MIN_INLINE_DENTRY_SIZE 40 /* just include '.' and '..' entries */ /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { From 2599cc3d728c639ea15f88a0157f1455c4179a86 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 26 Feb 2019 09:47:34 -0800 Subject: [PATCH 29/54] Revert "f2fs: fix to avoid deadlock of atomic file operations" This reverts commit f3ac182210162c7e76997a8566a7f9869349f3d8. --- fs/f2fs/segment.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 67bcecfcb55c..6f0bfaeebb47 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -215,8 +215,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) + struct list_head *head, bool drop, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -228,16 +227,7 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. - */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } + lock_page(page); f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -328,19 +318,13 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - while (!list_empty(&fi->inmem_pages)) { - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - - if (list_empty(&fi->inmem_pages)) { - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); - } - } + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -445,15 +429,12 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); + err = __revoke_inmem_pages(inode, &revoke_list, false, true); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); + __revoke_inmem_pages(inode, &revoke_list, false, false); } return err; From 52bff3f6ce0fd6e9324236600cfee2780a66ffb7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Feb 2019 00:08:25 +0800 Subject: [PATCH 30/54] f2fs: fix to check inline_xattr_size boundary correctly We use below condition to check inline_xattr_size boundary: if (!F2FS_OPTION(sbi).inline_xattr_size || F2FS_OPTION(sbi).inline_xattr_size >= DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) There is there problems in that check: - we should allow inline_xattr_size equaling to min size of inline {data,dentry} area. - F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on different size unit, previous one is 4 bytes, latter one is 1 bytes. - DEF_MIN_INLINE_SIZE only indicate min size of inline data area, however, we need to consider min size of inline dentry area as well, minimal inline dentry should at least contain two entries: '.' and '..', so that min inline_dentry size is 40 bytes. .bitmap 1 * 1 = 1 .reserved 1 * 1 = 1 .dentry 11 * 2 = 22 .filename 8 * 2 = 16 total 40 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 13 +++++++------ include/linux/f2fs_fs.h | 13 +++++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c836992d5a00..607abef0abcf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -524,7 +524,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fa54715e4132..1a586512a0d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -834,12 +834,13 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + if (F2FS_OPTION(sbi).inline_xattr_size < + sizeof(struct f2fs_xattr_header) / sizeof(__le32) || + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 6dcde67ba171..2f5fbc41118c 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -490,12 +490,12 @@ typedef __le32 f2fs_hash_t; /* * space utilization of regular dentry and inline dentry (w/o extra reservation) - * regular dentry inline dentry - * bitmap 1 * 27 = 27 1 * 23 = 23 - * reserved 1 * 3 = 3 1 * 7 = 7 - * dentry 11 * 214 = 2354 11 * 182 = 2002 - * filename 8 * 214 = 1712 8 * 182 = 1456 - * total 4096 3488 + * regular dentry inline dentry (def) inline dentry (min) + * bitmap 1 * 27 = 27 1 * 23 = 23 1 * 1 = 1 + * reserved 1 * 3 = 3 1 * 7 = 7 1 * 1 = 1 + * dentry 11 * 214 = 2354 11 * 182 = 2002 11 * 2 = 22 + * filename 8 * 214 = 1712 8 * 182 = 1456 8 * 2 = 16 + * total 4096 3488 40 * * Note: there are more reserved space in inline dentry than in regular * dentry, when converting inline dentry we should handle this carefully. @@ -507,6 +507,7 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) +#define MIN_INLINE_DENTRY_SIZE 40 /* just include '.' and '..' entries */ /* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { From 6a19f801501c559f98ef9c46f41ff2e51d72eb4b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Feb 2019 00:16:15 +0800 Subject: [PATCH 31/54] f2fs: don't allow negative ->write_io_size_bits As Dan reported: "We put an upper bound on ->write_io_size_bits but we don't have a lower bound." So let's add lower bound check for ->write_io_size_bits in parse_options(). [We don't allow configuring ->write_io_size_bits to zero, since at least we need to fill one dummy page for aligned IO.] Reported-by: Dan Carpenter Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a586512a0d7..fac6875ffa3f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { f2fs_msg(sb, KERN_WARNING, "Not support %d, larger than %d", 1 << arg, BIO_MAX_PAGES); From b88b75de8305636fb9b3b43bd475249109304563 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 5 Feb 2019 07:59:57 -0800 Subject: [PATCH 32/54] f2fs: don't clear CP_QUOTA_NEED_FSCK_FLAG If we met this once, let fsck.f2fs clear this only. Note that, this addresses all the subtle fault injection test. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0dc483a7f3d8..dc01dea911d6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1268,8 +1268,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); - else - __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* + * TODO: we count on fsck.f2fs to clear this flag until we figure out + * missing cases which clear it incorrectly. + */ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); From 79c3cf3d3b5ac73b92f47e2d894b5db4b2b22402 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Feb 2019 19:04:38 -0800 Subject: [PATCH 33/54] f2fs: fix wrong #endif We have to cover whole headerfile with last #endif. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 607abef0abcf..ffea1f6e8a3e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3709,8 +3709,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) #endif -#endif - static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA @@ -3723,3 +3721,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) #endif return false; } + +#endif From 66ebc2e50a523c2079cdc798286d2b6932d7bb64 Mon Sep 17 00:00:00 2001 From: Zeng Guangyue Date: Mon, 18 Feb 2019 14:26:41 +0800 Subject: [PATCH 34/54] f2fs: correct spelling mistake correct spelling mistake for "nunmber" Signed-off-by: Zeng Guangyue Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2f5fbc41118c..b19433738690 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -285,7 +285,7 @@ enum { struct node_footer { __le32 nid; /* node id */ - __le32 ino; /* inode nunmber */ + __le32 ino; /* inode number */ __le32 flag; /* include cold/fsync/dentry marks and offset */ __le64 cp_ver; /* checkpoint version */ __le32 next_blkaddr; /* next node page block address */ From 7307334f43684753f9bd4228a126d57cd20d6c1e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 19 Feb 2019 10:31:52 +0800 Subject: [PATCH 35/54] f2fs: silence VM_WARN_ON_ONCE in mempool_alloc Note that __GFP_ZERO is not supported for mempool_alloc, which also documented in the mempool_alloc comments. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 16a389733eb8..ba03cb1412f8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -296,9 +296,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, for (; start < F2FS_IO_SIZE(sbi); start++) { struct page *page = mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); + zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); lock_page(page); From 9878080f02d67df945dd34fd56cc6b41df002704 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 19 Feb 2019 16:23:53 +0800 Subject: [PATCH 36/54] f2fs: fix to retry fill_super only if recovery failed With current retry mechanism in f2fs_fill_super, first fill_super fails due to no memory, then second fill_super runs w/o recovery, if we succeed, we may lose fsynced data, it doesn't make sense. Let's retry fill_super only if it occurs non-ENOMEM error during recovery. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fac6875ffa3f..db40d5e0a435 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3061,10 +3061,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_super_block *raw_super; struct inode *root; int err; - bool retry = true, need_fsck = false; + bool skip_recovery = false, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; + int retry_cnt = 1; try_onemore: err = -EINVAL; @@ -3352,7 +3353,7 @@ try_onemore: goto free_meta; if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) - goto skip_recovery; + goto reset_checkpoint; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -3369,11 +3370,13 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - if (!retry) - goto skip_recovery; + if (skip_recovery) + goto reset_checkpoint; err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); @@ -3389,7 +3392,7 @@ try_onemore: goto free_meta; } } -skip_recovery: +reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3435,7 +3438,7 @@ skip_recovery: sync_free_meta: /* safe to flush all the data */ sync_filesystem(sbi->sb); - retry = false; + retry_cnt = 0; free_meta: #ifdef CONFIG_QUOTA @@ -3495,8 +3498,8 @@ free_sbi: kvfree(sbi); /* give only one another chance */ - if (retry) { - retry = false; + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; shrink_dcache_sb(sb); goto try_onemore; } From 40c364049511f14fee99dc271265f956d8d486f0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 19 Feb 2019 17:08:18 +0800 Subject: [PATCH 37/54] f2fs: make fault injection covering __submit_flush_wait() This patch changes to allow failure of f2fs_bio_alloc() in __submit_flush_wait(), which can simulate flush error in checkpoint() for covering more error paths. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6f0bfaeebb47..cebfcadcb9df 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -542,9 +542,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(sbi, 0, true); + struct bio *bio; int ret; + bio = f2fs_bio_alloc(sbi, 0, false); + if (!bio) + return -ENOMEM; + bio->bi_rw = REQ_OP_WRITE; bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); From ac589b083d7345a7942210ca4f2901ae2758ce82 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Feb 2019 20:37:14 +0800 Subject: [PATCH 38/54] f2fs: fix encrypted page memory leak For IPU path of f2fs_do_write_data_page(), in its error path, we need to release encrypted page and fscrypt context, otherwise it will cause memory leak. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ba03cb1412f8..51d97757dea8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1857,8 +1857,13 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); - if (err && PageWriteback(page)) - end_page_writeback(page); + if (err) { + if (f2fs_encrypted_file(inode)) + fscrypt_pullback_bio_page(&fio->encrypted_page, + true); + if (PageWriteback(page)) + end_page_writeback(page); + } trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; From 2e7ee431e4c36d2da217c06d9138e71a8b93bc5a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Feb 2019 20:40:13 +0800 Subject: [PATCH 39/54] f2fs: fix to update iostat correctly in IPU path In error path of IPU, we didn't account iostat correctly, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cebfcadcb9df..4a4d49f4482f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3259,10 +3259,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); - if (!err) + if (!err) { update_device_state(fio); - - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + } return err; } From 0d150640d185b6fa04080f74010d1c7fee835c4a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 21 Feb 2019 12:57:35 +0800 Subject: [PATCH 40/54] f2fs: no need to take page lock in readdir VFS will take inode_lock for readdir, therefore no need to take page lock in readdir at all just as the majority of other generic filesystems. This patch improves concurrency since .iterate_shared was introduced to VFS years ago. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8dff29053150..1f68e210f1db 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -896,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_get_lock_data_page(inode, n, false); + dentry_page = f2fs_find_data_page(inode, n); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { @@ -914,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); break; } - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); } out_free: fscrypt_fname_free_buffer(&fstr); From 2d7bc3f943058acadde2e596dcb69f129937ae83 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 25 Feb 2019 09:46:45 -0800 Subject: [PATCH 41/54] f2fs: give random value to i_generation This follows to give random number to i_generation along with commit 232530680290b ("ext4: improve smp scalability for inode generation") This can be used for DUN for UFS HW encryption. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- fs/f2fs/namei.c | 3 ++- fs/f2fs/super.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ffea1f6e8a3e..03678fcce65c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1307,8 +1307,6 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ - u32 s_next_generation; /* for NFS support */ - /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e682589e07c4..1e8a21fdb6a2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = F2FS_I(inode)->i_crtime = current_time(inode); - inode->i_generation = sbi->s_next_generation++; + inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index db40d5e0a435..f39861c1dcdb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3137,7 +3137,6 @@ try_onemore: sb->s_maxbytes = sbi->max_file_blocks << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; From 09554b63c705d49eb3e793cbc2781f8a8662955e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Feb 2019 09:48:27 +0800 Subject: [PATCH 42/54] f2fs: fix to dirty inode for i_mode recovery As Seulbae Kim reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202637 We didn't recover permission field correctly after sudden power-cut, the reason is in setattr we didn't add inode into global dirty list once i_mode is changed, so latter checkpoint triggered by fsync will not flush last i_mode into disk, result in this problem, fix it. Reported-by: Seulbae Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 724b2094bdec..8b1d393c08a4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -770,7 +770,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -845,8 +844,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -860,7 +857,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From 528a7e781c075cd35cf13413fe3a057e83e5b0fd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Feb 2019 17:11:03 +0800 Subject: [PATCH 43/54] f2fs: fix to avoid deadlock of atomic file operations Thread A Thread B - __fput - f2fs_release_file - drop_inmem_pages - mutex_lock(&fi->inmem_lock) - __revoke_inmem_pages - lock_page(page) - open - f2fs_setattr - truncate_setsize - truncate_inode_pages_range - lock_page(page) - truncate_cleanup_page - f2fs_invalidate_page - drop_inmem_page - mutex_lock(&fi->inmem_lock); We may encounter above ABBA deadlock as reported by Kyungtae Kim: I'm reporting a bug in linux-4.17.19: "INFO: task hung in drop_inmem_page" (no reproducer) I think this might be somehow related to the following: https://groups.google.com/forum/#!searchin/syzkaller-bugs/INFO$3A$20task$20hung$20in$20%7Csort:date/syzkaller-bugs/c6soBTrdaIo/AjAzPeIzCgAJ ========================================= INFO: task syz-executor7:10822 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D27024 10822 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:3617 __mutex_lock_common kernel/locking/mutex.c:833 [inline] __mutex_lock+0x5bd/0x1410 kernel/locking/mutex.c:893 mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:908 drop_inmem_page+0xcb/0x810 fs/f2fs/segment.c:327 f2fs_invalidate_page+0x337/0x5e0 fs/f2fs/data.c:2401 do_invalidatepage mm/truncate.c:165 [inline] truncate_cleanup_page+0x261/0x330 mm/truncate.c:187 truncate_inode_pages_range+0x552/0x1610 mm/truncate.c:367 truncate_inode_pages mm/truncate.c:478 [inline] truncate_pagecache+0x6d/0x90 mm/truncate.c:801 truncate_setsize+0x81/0xa0 mm/truncate.c:826 f2fs_setattr+0x44f/0x1270 fs/f2fs/file.c:781 notify_change+0xa62/0xe80 fs/attr.c:313 do_truncate+0x12e/0x1e0 fs/open.c:63 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e459c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e45a6cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e45a700 INFO: task syz-executor7:10858 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D28880 10858 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 __rwsem_down_write_failed_common kernel/locking/rwsem-xadd.c:565 [inline] rwsem_down_write_failed+0x5e6/0xc90 kernel/locking/rwsem-xadd.c:594 call_rwsem_down_write_failed+0x17/0x30 arch/x86/lib/rwsem.S:117 __down_write arch/x86/include/asm/rwsem.h:142 [inline] down_write+0x58/0xa0 kernel/locking/rwsem.c:72 inode_lock include/linux/fs.h:713 [inline] do_truncate+0x120/0x1e0 fs/open.c:61 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e3b4c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e3b56cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071c238 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e3b5700 INFO: task syz-executor5:10829 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor5 D28760 10829 6308 0x80000002 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 io_schedule+0x21/0x80 kernel/sched/core.c:5179 wait_on_page_bit_common mm/filemap.c:1100 [inline] __lock_page+0x2b5/0x390 mm/filemap.c:1273 lock_page include/linux/pagemap.h:483 [inline] __revoke_inmem_pages+0xb35/0x11c0 fs/f2fs/segment.c:231 drop_inmem_pages+0xa3/0x3e0 fs/f2fs/segment.c:306 f2fs_release_file+0x2c7/0x330 fs/f2fs/file.c:1556 __fput+0x2c7/0x780 fs/file_table.c:209 ____fput+0x1a/0x20 fs/file_table.c:243 task_work_run+0x151/0x1d0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x8ba/0x30a0 kernel/exit.c:865 do_group_exit+0x13b/0x3a0 kernel/exit.c:968 get_signal+0x6bb/0x1650 kernel/signal.c:2482 do_signal+0x84/0x1b70 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x155/0x190 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x445/0x4e0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f1c68e74ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 000000000071bf80 RCX: 00000000004497b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000071bf80 RBP: 000000000071bf80 R08: 0000000000000000 R09: 000000000071bf58 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f1c68e759c0 R15: 00007f1c68e75700 This patch tries to use trylock_page to mitigate such deadlock condition for fix. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4a4d49f4482f..fe0ab8d9b07f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -215,7 +215,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +228,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -318,13 +328,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -429,12 +445,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; From 0b5f8cceb8509d26d9a112898647be1502a1a8ce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 26 Feb 2019 19:01:15 +0800 Subject: [PATCH 44/54] f2fs: trace f2fs_ioc_shutdown This patch supports to trace f2fs_ioc_shutdown. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ include/trace/events/f2fs.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8b1d393c08a4..9cc2563d6811 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1987,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) out: if (in != F2FS_GOING_DOWN_FULLSYNC) mnt_drop_write_file(filp); + + trace_f2fs_shutdown(sbi, in, ret); + return ret; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 9bbb7c916c12..8fd6fcbc8c7a 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -141,6 +141,14 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }) +#define show_shutdown_mode(type) \ + __print_symbolic(type, \ + { F2FS_GOING_DOWN_FULLSYNC, "full sync" }, \ + { F2FS_GOING_DOWN_METASYNC, "meta sync" }, \ + { F2FS_GOING_DOWN_NOSYNC, "no sync" }, \ + { F2FS_GOING_DOWN_METAFLUSH, "meta flush" }, \ + { F2FS_GOING_DOWN_NEED_FSCK, "need fsck" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1607,6 +1615,30 @@ DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, TP_ARGS(sb, type, count) ); +TRACE_EVENT(f2fs_shutdown, + + TP_PROTO(struct f2fs_sb_info *sbi, unsigned int mode, int ret), + + TP_ARGS(sbi, mode, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, mode) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = sbi->sb->s_dev; + __entry->mode = mode; + __entry->ret = ret; + ), + + TP_printk("dev = (%d,%d), mode: %s, ret:%d", + show_dev(__entry->dev), + show_shutdown_mode(__entry->mode), + __entry->ret) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From ce14043aae02e0ccb146a049614547f26100d6c4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 26 Feb 2019 19:01:16 +0800 Subject: [PATCH 45/54] f2fs: print more parameters in trace_f2fs_map_blocks for better map_blocks trace. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 8fd6fcbc8c7a..fe009522f4e1 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -536,6 +536,9 @@ TRACE_EVENT(f2fs_map_blocks, __field(block_t, m_lblk) __field(block_t, m_pblk) __field(unsigned int, m_len) + __field(unsigned int, m_flags) + __field(int, m_seg_type) + __field(bool, m_may_create) __field(int, ret) ), @@ -545,15 +548,22 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; __entry->m_len = map->m_len; + __entry->m_flags = map->m_flags; + __entry->m_seg_type = map->m_seg_type; + __entry->m_may_create = map->m_may_create; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " - "start blkaddr = 0x%llx, len = 0x%llx, err = %d", + "start blkaddr = 0x%llx, len = 0x%llx, flags = %u," + "seg_type = %d, may_create = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, (unsigned long long)__entry->m_len, + __entry->m_flags, + __entry->m_seg_type, + __entry->m_may_create, __entry->ret) ); From f261d5b43f934094a57aedddae092d6184cd501c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Mar 2019 17:52:33 +0800 Subject: [PATCH 46/54] f2fs: fix to use kvfree instead of kzfree As Jiqun Li reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202747 System can panic due to using wrong allocate/free function pair in xattr interface: - use kvmalloc to allocate memory - use kzfree to free memory Let's fix to use kvfree instead of kzfree, BTW, we are safe to get rid of kzfree, since there is no such confidential data stored as xattr, we don't need to zero it before free memory. Fixes: 5222595d093e ("f2fs: use kvmalloc, if kmalloc is failed") Reported-by: Jiqun Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 71b27c55988a..262bda631931 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -386,7 +386,7 @@ check: *base_addr = txattr_addr; return 0; out: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -429,7 +429,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -556,7 +556,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -595,7 +595,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -726,7 +726,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kzfree(base_addr); + kvfree(base_addr); return error; } From a103284fe8e82b3a7a20c920d139bb0641af0172 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Mar 2019 16:18:33 +0800 Subject: [PATCH 47/54] f2fs: remove wrong comment in f2fs_invalidate_page() Since 8c242db9b8c0 ("f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer"), we've started to not skip clear private flag for atomic_write page truncation, so removing old wrong comment in f2fs_invalidate_page(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 51d97757dea8..d1c4162e332c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2708,7 +2708,6 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_cold_data(page); - /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); From 6831f03b94ec71929a3c0d5ee638e7845a2bfe28 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Mar 2019 17:30:59 +0800 Subject: [PATCH 48/54] f2fs: fix to add refcount once page is tagged PG_private As Gao Xiang reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202749 f2fs may skip pageout() due to incorrect page reference count. The problem here is that MM defined the rule [1] very clearly that once page was set with PG_private flag, we should increment the refcount in that page, also main flows like pageout(), migrate_page() will assume there is one additional page reference count if page_has_private() returns true. But currently, f2fs won't add/del refcount when changing PG_private flag. Anyway, f2fs should follow MM's rule to make MM's related flows running as expected. [1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com/ Reported-by: Gao Xiang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 21 ++++++++------------- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 21 +++++++++++++++++++++ fs/f2fs/node.c | 2 +- fs/f2fs/segment.c | 9 +++------ 6 files changed, 36 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index dc01dea911d6..06554ea87b0a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -407,7 +407,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } @@ -958,7 +958,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d1c4162e332c..aa33c500f703 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2711,8 +2711,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -2726,8 +2725,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; clear_cold_data(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); return 1; } @@ -2795,12 +2793,8 @@ int f2fs_migrate_page(struct address_space *mapping, return -EAGAIN; } - /* - * A reference is expected if PagePrivate set when move mapping, - * however F2FS breaks this for maintaining dirty page counts when - * truncating pages. So here adjusting the 'extra_count' make it work. - */ - extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + /* one extra reference was held for atomic_write page */ + extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { @@ -2821,9 +2815,10 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } - if (PagePrivate(page)) - SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + if (PagePrivate(page)) { + f2fs_set_page_private(newpage, page_private(page)); + f2fs_clear_page_private(page); + } migrate_page_copy(newpage, page); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1f68e210f1db..988af0b594a7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); - ClearPagePrivate(page); + f2fs_clear_page_private(page); ClearPageUptodate(page); clear_cold_data(page); inode_dec_dirty_pages(dir); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 03678fcce65c..6204b15e1521 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2920,6 +2920,27 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, return true; } +static inline void f2fs_set_page_private(struct page *page, + unsigned long data) +{ + if (PagePrivate(page)) + return; + + get_page(page); + SetPagePrivate(page); + set_page_private(page, data); +} + +static inline void f2fs_clear_page_private(struct page *page) +{ + if (!PagePrivate(page)) + return; + + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); +} + /* * file.c */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0f76c5e7998c..f527c2b08798 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1966,7 +1966,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fe0ab8d9b07f..cbfa80812a66 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - SetPagePrivate(page); + f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -280,8 +279,7 @@ next: ClearPageUptodate(page); clear_cold_data(page); } - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -370,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 0); trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); From 52b876811ab7c141c9a873d62b7b933f61c4e449 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 7 Mar 2019 17:31:30 +0800 Subject: [PATCH 49/54] f2fs: don't trigger read IO for beyond EOF page In f2fs_mpage_readpages(), if page is beyond EOF, we should just zero out it, but previously, before checking previous mapping info, we missed to check filesize boundary, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa33c500f703..62c1be90d952 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1547,6 +1547,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (last_block > last_block_in_file) last_block = last_block_in_file; + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; /* * Map blocks using the previous result first. */ @@ -1559,16 +1562,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * Then do more f2fs_map_blocks() calls until we are * done with this page. */ - map.m_flags = 0; + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; - if (block_in_file < last_block) { - map.m_lblk = block_in_file; - map.m_len = last_block - block_in_file; - - if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_DEFAULT)) - goto set_error_page; - } + if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT)) + goto set_error_page; got_it: if ((map.m_flags & F2FS_MAP_MAPPED)) { block_nr = map.m_pblk + block_in_file - map.m_lblk; @@ -1583,6 +1581,7 @@ got_it: DATA_GENERIC)) goto set_error_page; } else { +zero_out: zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); From 6fda31ae4bd975800270bc566dfbcc4af4f77348 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Mar 2019 11:49:53 -0700 Subject: [PATCH 50/54] f2fs: give some messages for inline_xattr_size This patch adds some kernel messages when user sets wrong inline_xattr_size. Fixes: 500e0b28ecd3 ("f2fs: fix to check inline_xattr_size boundary correctly") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f39861c1dcdb..b9d4d75f54d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + int min_size, max_size; + if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, @@ -834,15 +836,18 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (F2FS_OPTION(sbi).inline_xattr_size < - sizeof(struct f2fs_xattr_header) / sizeof(__le32) || - F2FS_OPTION(sbi).inline_xattr_size > - DEF_ADDRS_PER_INODE - + + min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + max_size = DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { + MIN_INLINE_DENTRY_SIZE / sizeof(__le32); + + if (F2FS_OPTION(sbi).inline_xattr_size < min_size || + F2FS_OPTION(sbi).inline_xattr_size > max_size) { f2fs_msg(sb, KERN_ERR, - "inline xattr size is out of range"); + "inline xattr size is out of range: %d ~ %d", + min_size, max_size); return -EINVAL; } } From 1f4e6f41cc90709c8faaa4b344589093d3d74d7e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Mar 2019 17:19:04 +0800 Subject: [PATCH 51/54] f2fs: fix to do sanity check with inode.i_inline_xattr_size As Paul Bandha reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202709 When I run the poc on the mounted f2fs img I get a buffer overflow in read_inline_xattr due to there being no sanity check on the value of i_inline_xattr_size. I created the img by just modifying the value of i_inline_xattr_size in the inode: i_name [test1.txt] i_ext: fofs:0 blkaddr:0 len:0 i_extra_isize [0x 18 : 24] i_inline_xattr_size [0x ffff : 65535] i_addr[ofs] [0x 0 : 0] mkdir /mnt/f2fs mount ./f2fs1.img /mnt/f2fs gcc poc.c -o poc ./poc int main() { int y = syscall(SYS_listxattr, "/mnt/f2fs/test1.txt", NULL, 0); printf("ret %d", y); printf("errno: %d\n", errno); } BUG: KASAN: slab-out-of-bounds in read_inline_xattr+0x18f/0x260 Read of size 262140 at addr ffff88011035efd8 by task f2fs1poc/3263 CPU: 0 PID: 3263 Comm: f2fs1poc Not tainted 4.18.0-custom #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.1-0-g0551a4be2c-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0x71/0xab print_address_description+0x83/0x250 kasan_report+0x213/0x350 memcpy+0x1f/0x50 read_inline_xattr+0x18f/0x260 read_all_xattrs+0xba/0x190 f2fs_listxattr+0x9d/0x3f0 listxattr+0xb2/0xd0 path_listxattr+0x93/0xe0 do_syscall_64+0x9d/0x220 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Let's add sanity check for inode.i_inline_xattr_size during f2fs_iget() to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 15 +++++++++++++++ fs/f2fs/super.c | 5 +---- fs/f2fs/xattr.h | 6 ++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3e739e4986b8..d33f860c4e63 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -14,6 +14,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include @@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted " + "i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b9d4d75f54d5..5418b1d55a4f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -838,10 +838,7 @@ static int parse_options(struct super_block *sb, char *options) } min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); - max_size = DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - - DEF_INLINE_RESERVED_SIZE - - MIN_INLINE_DENTRY_SIZE / sizeof(__le32); + max_size = MAX_INLINE_XATTR_SIZE; if (F2FS_OPTION(sbi).inline_xattr_size < min_size || F2FS_OPTION(sbi).inline_xattr_size > max_size) { diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 595d2af00a58..12a5ef3a607d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -78,6 +78,12 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) + /* * On-disk structure of f2fs_xattr * We use inline xattrs space + 1 block for xattr. From 413e1f1994f2476e911d6ccebd561ffebe7d1c20 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Mar 2019 19:32:26 +0800 Subject: [PATCH 52/54] f2fs: fix to adapt small inline xattr space in __find_inline_xattr() With below testcase, we will fail to find existed xattr entry: 1. mkfs.f2fs -O extra_attr -O flexible_inline_xattr /dev/zram0 2. mount -t f2fs -o inline_xattr_size=1 /dev/zram0 /mnt/f2fs/ 3. touch /mnt/f2fs/file 4. setfattr -n "user.name" -v 0 /mnt/f2fs/file 5. getfattr -n "user.name" /mnt/f2fs/file /mnt/f2fs/file: user.name: No such attribute The reason is for inode which has very small inline xattr size, __find_inline_xattr() will fail to traverse any entry due to first entry may not be loaded from xattr node yet, later, we may skip to check entire xattr datas in __find_xattr(), result in such wrong condition. This patch adds condition to check such case to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 262bda631931..8c6161ea4e23 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -270,11 +270,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -285,6 +285,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } From 557dadbd8114716f215d72d2d89060720e5ad45d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Mar 2019 15:44:27 +0800 Subject: [PATCH 53/54] f2fs: fix to avoid deadlock in f2fs_read_inline_dir() As Jiqun Li reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202883 sometimes, dead lock when make system call SYS_getdents64 with fsync() is called by another process. monkey running on android9.0 1. task 9785 held sbi->cp_rwsem and waiting lock_page() 2. task 10349 held mm_sem and waiting sbi->cp_rwsem 3. task 9709 held lock_page() and waiting mm_sem so this is a dead lock scenario. task stack is show by crash tools as following crash_arm64> bt ffffffc03c354080 PID: 9785 TASK: ffffffc03c354080 CPU: 1 COMMAND: "RxIoScheduler-3" >> #7 [ffffffc01b50fac0] __lock_page at ffffff80081b11e8 crash-arm64> bt 10349 PID: 10349 TASK: ffffffc018b83080 CPU: 1 COMMAND: "BUGLY_ASYNC_UPL" >> #3 [ffffffc01f8cfa40] rwsem_down_read_failed at ffffff8008a93afc PC: 00000033 LR: 00000000 SP: 00000000 PSTATE: ffffffffffffffff crash-arm64> bt 9709 PID: 9709 TASK: ffffffc03e7f3080 CPU: 1 COMMAND: "IntentService[A" >> #3 [ffffffc001e67850] rwsem_down_read_failed at ffffff8008a93afc >> #8 [ffffffc001e67b80] el1_ia at ffffff8008084fc4 PC: ffffff8008274114 [compat_filldir64+120] LR: ffffff80083584d4 [f2fs_fill_dentries+448] SP: ffffffc001e67b80 PSTATE: 80400145 X29: ffffffc001e67b80 X28: 0000000000000000 X27: 000000000000001a X26: 00000000000093d7 X25: ffffffc070d52480 X24: 0000000000000008 X23: 0000000000000028 X22: 00000000d43dfd60 X21: ffffffc001e67e90 X20: 0000000000000011 X19: ffffff80093a4000 X18: 0000000000000000 X17: 0000000000000000 X16: 0000000000000000 X15: 0000000000000000 X14: ffffffffffffffff X13: 0000000000000008 X12: 0101010101010101 X11: 7f7f7f7f7f7f7f7f X10: 6a6a6a6a6a6a6a6a X9: 7f7f7f7f7f7f7f7f X8: 0000000080808000 X7: ffffff800827409c X6: 0000000080808000 X5: 0000000000000008 X4: 00000000000093d7 X3: 000000000000001a X2: 0000000000000011 X1: ffffffc070d52480 X0: 0000000000800238 >> #9 [ffffffc001e67be0] f2fs_fill_dentries at ffffff80083584d0 PC: 0000003c LR: 00000000 SP: 00000000 PSTATE: 000000d9 X12: f48a02ff X11: d4678960 X10: d43dfc00 X9: d4678ae4 X8: 00000058 X7: d4678994 X6: d43de800 X5: 000000d9 X4: d43dfc0c X3: d43dfc10 X2: d46799c8 X1: 00000000 X0: 00001068 Below potential deadlock will happen between three threads: Thread A Thread B Thread C - f2fs_do_sync_file - f2fs_write_checkpoint - down_write(&sbi->node_change) -- 1) - do_page_fault - down_write(&mm->mmap_sem) -- 2) - do_wp_page - f2fs_vm_page_mkwrite - getdents64 - f2fs_read_inline_dir - lock_page -- 3) - f2fs_sync_node_pages - lock_page -- 3) - __do_map_lock - down_read(&sbi->node_change) -- 1) - f2fs_fill_dentries - dir_emit - compat_filldir64 - do_page_fault - down_read(&mm->mmap_sem) -- 2) Since f2fs_readdir is protected by inode.i_rwsem, there should not be any updates in inode page, we're safe to lookup dents in inode page without its lock held, so taking off the lock to improve concurrency of readdir and avoid potential deadlock. Reported-by: Jiqun Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ec067935ca78..8d3d0022cef8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? err : 0; } From 22c197243aeef0fa552f7e2ae41500a437e79890 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 13 Mar 2019 16:15:08 -0700 Subject: [PATCH 54/54] f2fs: set pin_file under CAP_SYS_ADMIN Android uses pin_file for uncrypt during OTA, and that should be managed by CAP_SYS_ADMIN only. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9cc2563d6811..637a9dfbd578 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2653,8 +2653,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) __u32 pin; int ret = 0; - if (!inode_owner_or_capable(inode)) - return -EACCES; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (get_user(pin, (__u32 __user *)arg)) return -EFAULT;