From 8da9e3f7476137747b8502b87df80738861d324c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 17:18:56 -0700 Subject: [PATCH 001/804] f2fs: backport from (4c1fad64 - Merge tag 'for-f2fs-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs) Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 18 +- fs/Kconfig | 2 + fs/Makefile | 1 + fs/crypto/Kconfig | 18 + fs/crypto/Makefile | 3 + fs/crypto/crypto.c | 568 +++++++++ fs/{f2fs/crypto_fname.c => crypto/fname.c} | 276 ++-- fs/crypto/keyinfo.c | 304 +++++ fs/crypto/policy.c | 246 ++++ fs/f2fs/Kconfig | 21 +- fs/f2fs/Makefile | 2 - fs/f2fs/acl.c | 21 +- fs/f2fs/acl.h | 3 +- fs/f2fs/checkpoint.c | 529 +++++--- fs/f2fs/crypto.c | 491 -------- fs/f2fs/crypto_key.c | 254 ---- fs/f2fs/crypto_policy.c | 212 ---- fs/f2fs/data.c | 1252 ++++++++++-------- fs/f2fs/debug.c | 80 +- fs/f2fs/dir.c | 451 +++---- fs/f2fs/extent_cache.c | 315 ++--- fs/f2fs/f2fs.h | 1010 +++++++++------ fs/f2fs/f2fs_crypto.h | 151 --- fs/f2fs/file.c | 1323 ++++++++++++++------ fs/f2fs/gc.c | 353 ++++-- fs/f2fs/gc.h | 8 - fs/f2fs/inline.c | 266 ++-- fs/f2fs/inode.c | 176 +-- fs/f2fs/namei.c | 409 +++--- fs/f2fs/node.c | 733 +++++++---- fs/f2fs/node.h | 123 +- fs/f2fs/recovery.c | 274 ++-- fs/f2fs/segment.c | 689 ++++++---- fs/f2fs/segment.h | 47 +- fs/f2fs/shrinker.c | 8 +- fs/f2fs/super.c | 845 ++++++++++--- fs/f2fs/trace.c | 6 +- fs/f2fs/xattr.c | 69 +- fs/f2fs/xattr.h | 3 +- include/linux/dcache.h | 1 + include/linux/f2fs_fs.h | 44 +- include/linux/fs.h | 7 + include/linux/fscrypto.h | 435 +++++++ include/trace/events/f2fs.h | 64 +- include/uapi/linux/fs.h | 18 + 45 files changed, 7635 insertions(+), 4494 deletions(-) create mode 100644 fs/crypto/Kconfig create mode 100644 fs/crypto/Makefile create mode 100644 fs/crypto/crypto.c rename fs/{f2fs/crypto_fname.c => crypto/fname.c} (54%) create mode 100644 fs/crypto/keyinfo.c create mode 100644 fs/crypto/policy.c delete mode 100644 fs/f2fs/crypto.c delete mode 
100644 fs/f2fs/crypto_key.c delete mode 100644 fs/f2fs/crypto_policy.c delete mode 100644 fs/f2fs/f2fs_crypto.h create mode 100644 include/linux/fscrypto.h diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b102b436563e..753dd4f96afe 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,14 +102,16 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle. If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. If background_gc=sync, it will turn + will be turned off. If background_gc=sync, it will turn on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) -discard Issue discard/TRIM commands when a segment is cleaned. +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. no_heap Disable heap-style segment allocation which finds free segments for data from the beginning of main area, while for node from the end of main area. @@ -129,6 +131,7 @@ inline_dentry Enable the inline dir feature: data in new created directory entries can be written into inode block. The space of inode block which is used to store inline dentries is limited to ~3.4k. +noinline_dentry Disable the inline dentry feature. flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. 
If the underlying device handles the cache_flush command relatively slowly, @@ -145,10 +148,15 @@ extent_cache Enable an extent cache based on rb-tree, it can cache as many as extent which map between contiguous logical address and physical address per inode, resulting in increasing the cache hit ratio. Set by default. -noextent_cache Diable an extent cache based on rb-tree explicitly, see +noextent_cache Disable an extent cache based on rb-tree explicitly, see the above extent_cache mount option. noinline_data Disable the inline data feature, inline data feature is enabled by default. +data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. ================================================================================ DEBUGFS ENTRIES @@ -192,7 +200,7 @@ Files in /sys/fs/f2fs/ policy for garbage collection. Setting gc_idle = 0 (default) will disable this option. Setting gc_idle = 1 will select the Cost Benefit approach - & setting gc_idle = 2 will select the greedy aproach. + & setting gc_idle = 2 will select the greedy approach. reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree @@ -298,7 +306,7 @@ The dump.f2fs shows the information of specific inode and dumps SSA and SIT to file. Each file is dump_ssa and dump_sit. The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. -It shows on-disk inode information reconized by a given inode number, and is +It shows on-disk inode information recognized by a given inode number, and is able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and ./dump_sit respectively. 
diff --git a/fs/Kconfig b/fs/Kconfig index 6ce72d8d1ee1..16a7e2871213 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -73,6 +73,8 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +source "fs/crypto/Kconfig" + source "fs/notify/Kconfig" source "fs/quota/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 79f522575cba..252c96898a43 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o +obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig new file mode 100644 index 000000000000..92348faf9865 --- /dev/null +++ b/fs/crypto/Kconfig @@ -0,0 +1,18 @@ +config FS_ENCRYPTION + tristate "FS Encryption (Per-file encryption)" + depends on BLOCK + select CRYPTO + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile new file mode 100644 index 000000000000..f17684c48739 --- /dev/null +++ b/fs/crypto/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o + +fscrypto-y := crypto.o fname.o policy.o keyinfo.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c new file mode 100644 index 000000000000..2fc8c43ce531 --- /dev/null +++ b/fs/crypto/crypto.c @@ -0,0 +1,568 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. 
+ * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *fscrypt_bounce_page_pool = NULL; + +static LIST_HEAD(fscrypt_free_ctxs); +static DEFINE_SPINLOCK(fscrypt_ctx_lock); + +static struct workqueue_struct *fscrypt_read_workqueue; +static DEFINE_MUTEX(fscrypt_init_mutex); + +static struct kmem_cache *fscrypt_ctx_cachep; +struct kmem_cache *fscrypt_info_cachep; + +/** + * fscrypt_release_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. 
+ */ +void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); + ctx->w.bounce_page = NULL; + } + ctx->w.control_page = NULL; + if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(fscrypt_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + list_add(&ctx->free_list, &fscrypt_free_ctxs); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + } +} +EXPORT_SYMBOL(fscrypt_release_ctx); + +/** + * fscrypt_get_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx = NULL; + struct fscrypt_info *ci = inode->i_crypt_info; + unsigned long flags; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. 
+ */ + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + ctx = list_first_entry_or_null(&fscrypt_free_ctxs, + struct fscrypt_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~FS_WRITE_PATH_FL; + return ctx; +} +EXPORT_SYMBOL(fscrypt_get_ctx); + +/** + * fscrypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void fscrypt_complete(struct crypto_async_request *req, int res) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +static int do_page_crypto(struct inode *inode, + fscrypt_direction_t rw, pgoff_t index, + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) +{ + u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = skcipher_request_alloc(tfm, gfp_flags); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + fscrypt_complete, &ecr); + + BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + FS_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_init_table(&src, 1); 
+ sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, + xts_tweak); + if (rw == FS_DECRYPT) + res = crypto_skcipher_decrypt(req); + else + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + skcipher_request_free(req); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_skcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +{ + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * fscrypt_encrypt_page() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * fscrypt_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *fscrypt_encrypt_page(struct inode *inode, + struct page *plaintext_page, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = fscrypt_get_ctx(inode, gfp_flags); + if (IS_ERR(ctx)) + return (struct page *)ctx; + + /* The encryption operation will require a bounce page. 
*/ + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + if (IS_ERR(ciphertext_page)) + goto errout; + + ctx->w.control_page = plaintext_page; + err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page, + gfp_flags); + if (err) { + ciphertext_page = ERR_PTR(err); + goto errout; + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; + +errout: + fscrypt_release_ctx(ctx); + return ciphertext_page; +} +EXPORT_SYMBOL(fscrypt_encrypt_page); + +/** + * fscrypt_decrypt_page() - Decrypts a page in-place + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_decrypt_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return do_page_crypto(page->mapping->host, + FS_DECRYPT, page->index, page, page, GFP_NOFS); +} +EXPORT_SYMBOL(fscrypt_decrypt_page); + +int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != 
inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if ((err == 0) && bio->bi_error) + err = -EIO; + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + struct fscrypt_info *ci; + int dir_has_key, cached_with_key; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + dput(dir); + return 0; + } + + ci = d_inode(dir)->i_crypt_info; + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be a flag in d_flags */ + spin_lock(&dentry->d_lock); + cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + dir_has_key = (ci != NULL); + dput(dir); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so the filesystem's ->lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. 
+ */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) + return 0; + return 1; +} + +const struct dentry_operations fscrypt_d_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +EXPORT_SYMBOL(fscrypt_d_ops); + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. 
*/ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +void fscrypt_restore_control_page(struct page *page) +{ + struct fscrypt_ctx *ctx; + + ctx = (struct fscrypt_ctx *)page_private(page); + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + fscrypt_release_ctx(ctx); +} +EXPORT_SYMBOL(fscrypt_restore_control_page); + +static void fscrypt_destroy(void) +{ + struct fscrypt_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list) + kmem_cache_free(fscrypt_ctx_cachep, pos); + INIT_LIST_HEAD(&fscrypt_free_ctxs); + mempool_destroy(fscrypt_bounce_page_pool); + fscrypt_bounce_page_pool = NULL; +} + +/** + * fscrypt_initialize() - allocate major buffers for fs encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_initialize(void) +{ + int i, res = -ENOMEM; + + if (fscrypt_bounce_page_pool) + return 0; + + mutex_lock(&fscrypt_init_mutex); + if (fscrypt_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct fscrypt_ctx *ctx; + + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &fscrypt_free_ctxs); + } + + fscrypt_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!fscrypt_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&fscrypt_init_mutex); + return 0; +fail: + fscrypt_destroy(); + mutex_unlock(&fscrypt_init_mutex); + return res; +} +EXPORT_SYMBOL(fscrypt_initialize); + +/** + * fscrypt_init() - Set up for fs encryption. 
+ */ +static int __init fscrypt_init(void) +{ + fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", + WQ_HIGHPRI, 0); + if (!fscrypt_read_workqueue) + goto fail; + + fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_ctx_cachep) + goto fail_free_queue; + + fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_info_cachep) + goto fail_free_ctx; + + return 0; + +fail_free_ctx: + kmem_cache_destroy(fscrypt_ctx_cachep); +fail_free_queue: + destroy_workqueue(fscrypt_read_workqueue); +fail: + return -ENOMEM; +} +module_init(fscrypt_init) + +/** + * fscrypt_exit() - Shutdown the fs encryption system + */ +static void __exit fscrypt_exit(void) +{ + fscrypt_destroy(); + + if (fscrypt_read_workqueue) + destroy_workqueue(fscrypt_read_workqueue); + kmem_cache_destroy(fscrypt_ctx_cachep); + kmem_cache_destroy(fscrypt_info_cachep); +} +module_exit(fscrypt_exit); + +MODULE_LICENSE("GPL"); diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c similarity index 54% rename from fs/f2fs/crypto_fname.c rename to fs/crypto/fname.c index ab377d496a39..5d6d49113efa 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/crypto/fname.c @@ -1,46 +1,32 @@ /* - * linux/fs/f2fs/crypto_fname.c - * - * Copied from linux/fs/ext4/crypto.c + * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * - * This contains functions for filename crypto management in f2fs - * * Written by Uday Savagaonkar, 2014. - * - * Adjust f2fs dentry structure - * Jaegeuk Kim, 2015. + * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. 
*/ -#include -#include + #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include +#include -#include "f2fs.h" -#include "f2fs_crypto.h" -#include "xattr.h" +static u32 size_round_up(size_t size, size_t blksize) +{ + return ((size + blksize - 1) / blksize) * blksize; +} /** - * f2fs_dir_crypt_complete() - + * dir_crypt_complete() - */ -static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) +static void dir_crypt_complete(struct crypto_async_request *req, int res) { - struct f2fs_completion_result *ecr = req->data; + struct fscrypt_completion_result *ecr = req->data; if (res == -EINPROGRESS) return; @@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -bool f2fs_valid_filenames_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS); -} - -static unsigned max_name_len(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - F2FS_NAME_LEN; -} - /** - * f2fs_fname_encrypt() - + * fname_encrypt() - * * This function encrypts the input filename, and returns the length of the * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. 
*/ -static int f2fs_fname_encrypt(struct inode *inode, - const struct qstr *iname, struct f2fs_str *oname) +static int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; + char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; - int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim = max_name_len(inode); + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? - F2FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? + FS_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = size_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > lim) ? 
lim : ciphertext_len; if (ciphertext_len <= sizeof(buf)) { @@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode, memset(workbuf + iname->len, 0, ciphertext_len - iname->len); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); - if (res < 0) { + skcipher_request_free(req); + if (res < 0) printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); - } + oname->len = ciphertext_len; return res; } /* - * f2fs_fname_decrypt() + * fname_decrypt() * This function decrypts the input filename, and returns * the length of the plaintext. * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. 
*/ -static int f2fs_fname_decrypt(struct inode *inode, - const struct f2fs_str *iname, struct f2fs_str *oname) +static int fname_decrypt(struct inode *inode, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; - unsigned lim = max_name_len(inode); + char iv[FS_CRYPTO_BLOCK_SIZE]; + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR - "%s: Error in f2fs_fname_decrypt (error 
code %d)\n", - __func__, res); + "%s: Error (error code %d)\n", __func__, res); return res; } @@ -200,7 +175,7 @@ static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; /** - * f2fs_fname_encode_digest() - + * digest_encode() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. @@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -/** - * f2fs_fname_crypto_round_up() - - * - * Return: The next multiple of block size - */ -u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize) +u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) { - return ((size + blksize - 1) / blksize) * blksize; + int padding = 32; + struct fscrypt_info *ci = inode->i_crypt_info; + + if (ci) + padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); + if (ilen < FS_CRYPTO_BLOCK_SIZE) + ilen = FS_CRYPTO_BLOCK_SIZE; + return size_round_up(ilen, padding); } +EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * f2fs_fname_crypto_alloc_obuff() - + * fscrypt_fname_alloc_buffer() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
*/ -int f2fs_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct f2fs_str *crypto_str) +int fscrypt_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen; - int padding = 16; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); - if (ci) - padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); - if (padding < F2FS_CRYPTO_BLOCK_SIZE) - padding = F2FS_CRYPTO_BLOCK_SIZE; - olen = f2fs_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; - if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; - /* Allocated buffer can hold one more character to null-terminate the - * string */ + if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) + olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + /* + * Allocated buffer can hold one more character to null-terminate the + * string + */ crypto_str->name = kmalloc(olen + 1, GFP_NOFS); if (!(crypto_str->name)) return -ENOMEM; return 0; } +EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * f2fs_fname_crypto_free_buffer() - + * fscrypt_fname_free_buffer() - * * Frees the buffer allocated for crypto operation. 
*/ -void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) +void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } +EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space + * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user + * space */ -int f2fs_fname_disk_to_usr(struct inode *inode, - f2fs_hash_t *hash, - const struct f2fs_str *iname, - struct f2fs_str *oname) +int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; int ret; - if (is_dot_dotdot(&qname)) { + if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - if (F2FS_I(inode)->i_crypt_info) - return f2fs_fname_decrypt(inode, iname, oname); + if (iname->len < FS_CRYPTO_BLOCK_SIZE) + return -EUCLEAN; - if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (inode->i_crypt_info) + return fname_decrypt(inode, iname, oname); + + if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); oname->len = ret; return ret; } if (hash) { - memcpy(buf, hash, 4); - memset(buf + 4, 0, 4); - } else + memcpy(buf, &hash, 4); + memcpy(buf + 4, &minor_hash, 4); + } else { memset(buf, 0, 8); + } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; ret = digest_encode(buf, 24, oname->name + 1); oname->len = ret + 1; return ret + 1; } +EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** - * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space + * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk + * space */ -int f2fs_fname_usr_to_disk(struct inode *inode, +int fscrypt_fname_usr_to_disk(struct inode 
*inode, const struct qstr *iname, - struct f2fs_str *oname) + struct fscrypt_str *oname) { - int res; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (is_dot_dotdot(iname)) { + if (fscrypt_is_dot_dotdot(iname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - - if (ci) { - res = f2fs_fname_encrypt(inode, iname, oname); - return res; - } - /* Without a proper key, a user is not allowed to modify the filenames + if (inode->i_crypt_info) + return fname_encrypt(inode, iname, oname); + /* + * Without a proper key, a user is not allowed to modify the filenames * in a directory. Consequently, a user space name cannot be mapped to - * a disk-space name */ + * a disk-space name + */ return -EACCES; } +EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); -int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct f2fs_filename *fname) +int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct fscrypt_name *fname) { - struct f2fs_crypt_info *ci; int ret = 0, bigname = 0; - memset(fname, 0, sizeof(struct f2fs_filename)); + memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { + if (!dir->i_sb->s_cop->is_encrypted(dir) || + fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } - ret = f2fs_get_encryption_info(dir); - if (ret) + ret = get_crypt_info(dir); + if (ret && ret != -EOPNOTSUPP) return ret; - ci = F2FS_I(dir)->i_crypt_info; - if (ci) { - ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, - &fname->crypto_buf); + + if (dir->i_crypt_info) { + ret = fscrypt_fname_alloc_buffer(dir, iname->len, + &fname->crypto_buf); if (ret < 0) return ret; - ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); + ret = fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) goto errout; 
fname->disk_name.name = fname->crypto_buf.name; @@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, if (!lookup) return -EACCES; - /* We don't have the key and we are doing a lookup; decode the + /* + * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->name[0] == '_') bigname = 1; - if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) + if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) return -ENOENT; fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, fname->crypto_buf.name); if (ret < 0) { @@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->crypto_buf.len = ret; if (bigname) { memcpy(&fname->hash, fname->crypto_buf.name, 4); + memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; } return 0; + errout: - f2fs_fname_crypto_free_buffer(&fname->crypto_buf); + fscrypt_fname_free_buffer(&fname->crypto_buf); return ret; } +EXPORT_SYMBOL(fscrypt_setup_filename); -void f2fs_fname_free_filename(struct f2fs_filename *fname) +void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; fname->usr_fname = NULL; fname->disk_name.name = NULL; } +EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c new file mode 100644 index 000000000000..1ac263eddc4e --- /dev/null +++ b/fs/crypto/keyinfo.c @@ -0,0 +1,304 @@ +/* + * key management facility for FS encryption support. + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
+ */ + +#include +#include +#include +#include +#include +#include + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivation. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], + u8 source_key[FS_AES_256_XTS_KEY_SIZE], + u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_skcipher_setkey(tfm, deriving_key, + FS_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + FS_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + wait_for_completion(&ecr.completion); + res = ecr.res; + } +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key 
*keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + +static void put_crypt_info(struct fscrypt_info *ci) +{ + if (!ci) + return; + + key_put(ci->ci_keyring_key); + crypto_free_skcipher(ci->ci_ctfm); + kmem_cache_free(fscrypt_info_cachep, ci); +} + +int get_crypt_info(struct inode *inode) +{ + struct fscrypt_info *crypt_info; + struct fscrypt_context ctx; + struct crypto_skcipher *ctfm; + const char *cipher_str; + u8 raw_key[FS_MAX_KEY_SIZE]; + u8 mode; + int res; + + res = fscrypt_initialize(); + if (res) + 
return res; + + if (!inode->i_sb->s_cop->get_context) + return -EOPNOTSUPP; +retry: + crypt_info = ACCESS_ONCE(inode->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + fscrypt_put_encryption_info(inode, crypt_info); + goto retry; + } + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!fscrypt_dummy_context_enabled(inode)) + return res; + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) { + return -EINVAL; + } + res = 0; + + crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case FS_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "%s: unsupported key mode %d (ino %u)\n", + __func__, mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + if (fscrypt_dummy_context_enabled(inode)) { + memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); + goto got_key; + } + + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, 
&prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { + goto out; + } +got_key: + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + if (res) + goto out; + + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { + put_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) +{ + struct fscrypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(inode->i_crypt_info); + if (ci == NULL) + return; + + prev = cmpxchg(&inode->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + put_crypt_info(ci); +} +EXPORT_SYMBOL(fscrypt_put_encryption_info); + +int fscrypt_get_encryption_info(struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return get_crypt_info(inode); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c new file mode 100644 index 000000000000..ed115acb5dee --- /dev/null +++ b/fs/crypto/policy.c @@ -0,0 +1,246 @@ +/* + * Encryption policy functions for per-file encryption support. + * + * Copyright (C) 2015, Google, Inc. 
+ * Copyright (C) 2015, Motorola Mobility. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#include +#include +#include +#include + +static int inode_has_encryption_context(struct inode *inode) +{ + if (!inode->i_sb->s_cop->get_context) + return 0; + return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int is_encryption_context_consistent_with_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int create_encryption_context_from_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + if (inode->i_sb->s_cop->prepare_context) { + res = inode->i_sb->s_cop->prepare_context(inode); + if (res) + return res; + } + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + + if (!fscrypt_valid_contents_enc_mode( + policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + + if (!fscrypt_valid_filenames_enc_mode( + policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return 
-EINVAL; + } + + if (policy->flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; + + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + + return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); +} + +int fscrypt_process_policy(struct file *filp, + const struct fscrypt_policy *policy) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (policy->version != 0) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; + } + + mnt_drop_write_file(filp); + return ret; +} +EXPORT_SYMBOL(fscrypt_process_policy); + +int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context || + !inode->i_sb->s_cop->is_encrypted(inode)) + return -ENODATA; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return -ENODATA; + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, 
ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_policy); + +int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) +{ + struct fscrypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + printk(KERN_ERR "parent %p child %p\n", parent, child); + BUG_ON(1); + } + + /* no restrictions if the parent directory is not encrypted */ + if (!parent->i_sb->s_cop->is_encrypted(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!parent->i_sb->s_cop->is_encrypted(child)) + return 0; + res = fscrypt_get_encryption_info(parent); + if (res) + return 0; + res = fscrypt_get_encryption_info(child); + if (res) + return 0; + parent_ci = parent->i_crypt_info; + child_ci = child->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} +EXPORT_SYMBOL(fscrypt_has_permitted_context); + +/** + * fscrypt_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * @fs_data: private data given by FS. 
+ * @preload: preload child i_crypt_info + * + * Return: Zero on success, non-zero otherwise + */ +int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload) +{ + struct fscrypt_context ctx; + struct fscrypt_info *ci; + int res; + + if (!parent->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + res = fscrypt_get_encryption_info(parent); + if (res < 0) + return res; + + ci = parent->i_crypt_info; + if (ci == NULL) + return -ENOKEY; + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + if (fscrypt_dummy_context_enabled(parent)) { + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); + } + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + res = parent->i_sb->s_cop->set_context(child, &ctx, + sizeof(ctx), fs_data); + if (res) + return res; + return preload ? fscrypt_get_encryption_info(child): 0; +} +EXPORT_SYMBOL(fscrypt_inherit_context); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index b0a9dc929f88..1852d99df97b 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,6 +1,9 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select CRYPTO + select KEYS + select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. 
The design has been focused on @@ -76,15 +79,7 @@ config F2FS_FS_ENCRYPTION bool "F2FS Encryption" depends on F2FS_FS depends on F2FS_FS_XATTR - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_CTR - select CRYPTO_SHA256 - select KEYS - select ENCRYPTED_KEYS + select FS_ENCRYPTION help Enable encryption of f2fs files and directories. This feature is similar to ecryptfs, but it is more memory @@ -100,3 +95,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 08e101ed914c..ca949ea7c02f 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o -f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \ - crypto_key.o crypto_fname.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index c8f25f7241f0..fb0744b94c2f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -109,14 +109,16 @@ fail: return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_NOFS); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, 
name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -204,7 +206,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -217,7 +218,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; - set_acl_inode(fi, inode->i_mode); + set_acl_inode(inode, inode->i_mode); if (error == 0) acl = NULL; } @@ -234,9 +235,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -247,7 +248,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } @@ -388,6 +389,8 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; + f2fs_mark_inode_dirty_sync(inode); + if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 997ca8edb6cb..2c685185c24d 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,11 +37,10 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int 
f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d80474be..cb23d6cf676b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -34,13 +42,14 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META); - SetPageUptodate(page); + f2fs_wait_on_page_writeback(page, META, true); + if (!PageUptodate(page)) + SetPageUptodate(page); return page; } @@ -56,14 +65,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO, - .blk_addr = index, + .old_blkaddr = index, + .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) fio.rw &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -90,7 +100,7 @@ repeat: * meta page. 
*/ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } @@ -143,7 +153,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { - block_t prev_blk_addr = 0; struct page *page; block_t blkno = start; struct f2fs_io_info fio = { @@ -152,10 +161,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; + struct blk_plug plug; if (unlikely(type == META_POR)) fio.rw &= ~REQ_META; + blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) @@ -167,27 +178,25 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) blkno = 0; /* get nat block addr */ - fio.blk_addr = current_nat_addr(sbi, + fio.new_blkaddr = current_nat_addr(sbi, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: /* get sit block addr */ - fio.blk_addr = current_sit_addr(sbi, + fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - if (blkno != start && prev_blk_addr + 1 != fio.blk_addr) - goto out; - prev_blk_addr = fio.blk_addr; break; case META_SSA: case META_CP: case META_POR: - fio.blk_addr = blkno; + fio.new_blkaddr = blkno; break; default: BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { @@ -196,11 +205,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; + fio.old_blkaddr = fio.new_blkaddr; f2fs_submit_page_mbio(&fio); f2fs_put_page(page, 0); } out: f2fs_submit_merged_bio(sbi, META, READ); + blk_finish_plug(&plug); return blkno - start; } @@ -210,7 +221,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t 
index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); @@ -232,13 +243,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); + return 0; redirty_out: @@ -252,13 +267,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; - trace_f2fs_writepages(mapping->host, wbc, META); - /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, META); + /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); @@ -269,6 +284,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); return 0; } @@ -276,15 +292,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { .for_reclaim = 0, }; + struct blk_plug plug; pagevec_init(&pvec, 0); + blk_start_plug(&plug); + while (index <= end) { int i, nr_pages; nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, @@ -296,7 +315,7 @@ long 
sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (prev == LONG_MAX) + if (prev == ULONG_MAX) prev = page->index - 1; if (nr_to_write != LONG_MAX && page->index != prev + 1) { pagevec_release(&pvec); @@ -315,6 +334,9 @@ continue_unlock: goto continue_unlock; } + f2fs_wait_on_page_writeback(page, META, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -334,6 +356,8 @@ stop: if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); + blk_finish_plug(&plug); + return nwritten; } @@ -341,9 +365,10 @@ static int f2fs_set_meta_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, META); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -358,6 +383,9 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -410,13 +438,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -434,12 +462,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) 
return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -459,6 +487,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -478,10 +513,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + update_inode_page(inode); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -493,8 +529,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; + struct node_info ni; + int err = acquire_orphan_inode(sbi); - inode = f2fs_iget(sbi->sb, ino); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; + } + + __add_ino_entry(sbi, ino, ORPHAN_INO); + + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry @@ -508,6 +556,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); + + get_node_info(sbi, ino, &ni); + + /* ENOMEM was fully retried in f2fs_evict_inode. 
*/ + if (ni.blk_addr != NULL_ADDR) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return -EIO; + } + __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; } @@ -516,7 +576,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; int err; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -540,7 +600,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); return 0; } @@ -601,45 +661,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) } } +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) +{ + unsigned long blk_size = sbi->blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + *cp_page = get_meta_page(sbi, cp_addr); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset >= blk_size) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } + + crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block + + crc_offset))); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } + + *version = cur_cp_version(*cp_block); + return 0; +} + static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, block_t cp_addr, unsigned long long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; - unsigned long blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; + struct page *cp_page_1 
= NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; - __u32 crc = 0; + int err; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); - - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) goto invalid_cp1; + pre_version = *version; - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp1; - - pre_version = cur_cp_version(cp_block); - - /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); - - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) goto invalid_cp2; - - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp2; - - cur_version = cur_cp_version(cp_block); + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; @@ -696,6 +766,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + /* Sanity checking of checkpoint */ + if (sanity_check_ckpt(sbi)) + goto fail_no_cp; + if (cp_blks <= 1) goto done; @@ -722,118 +796,94 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(inode, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); -} - -void remove_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); - - /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); - iput(inode); - } + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -848,6 +898,34 @@ retry: goto retry; } +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_entry(head->next, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + update_inode_page(inode); + iput(inode); + } + }; + return 0; +} + /* * Freeze all the FS-operations for checkpoint. 
*/ @@ -868,11 +946,17 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) + goto out; + goto retry_flush_dents; + } + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +969,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -901,6 +984,8 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); + + build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -911,18 +996,48 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; - io_schedule(); + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + spin_lock(&sbi->cp_lock); + + if (cpc->reason == CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) 
+ __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + + spin_unlock(&sbi->cp_lock); +} + +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; @@ -931,21 +1046,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); - bool invalidate = false; - - /* - * This avoids to conduct wrong roll-forward operations and uses - * metapages, so should be called prior to sync_meta_pages below. - */ - if (discard_next_dnode(sbi, discard_blk)) - invalidate = true; + struct super_block *sb = sbi->sb; + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -980,10 +1089,12 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); + spin_lock(&sbi->cp_lock); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock(&sbi->cp_lock); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -998,29 +1109,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) 
cp_payload_blks + data_sum_blocks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - else - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - - if (cpc->reason == CP_FASTBOOT) - set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - else - clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - - if (orphan_num) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); - crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); + crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset)); *((__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); @@ -1030,7 +1126,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1046,6 +1142,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + if (sb->s_bdev->bd_part) + kbytes_written += BD_PART_WRITTEN(sbi); + + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + if (__remain_node_summaries(cpc->reason)) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; @@ -1058,14 +1162,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - 
filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); @@ -1073,30 +1177,36 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - /* - * invalidate meta page which is used temporarily for zeroing out - * block at the end of warm node chain. - */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, - discard_blk); - - release_dirty_inode(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. 
*/ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,21 +1214,35 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_flush_merged_bios(sbi); + + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { + f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); + f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); + f2fs_bug_on(sbi, prefree_segments(sbi)); + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } /* * update checkpoint pack index @@ -1133,7 +1257,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1267,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * 
sbi->cp_interval); + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c deleted file mode 100644 index 4a62ef14e932..000000000000 --- a/fs/f2fs/crypto.c +++ /dev/null @@ -1,491 +0,0 @@ -/* - * linux/fs/f2fs/crypto.c - * - * Copied from linux/fs/ext4/crypto.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility - * - * This contains encryption functions for f2fs - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Remove ext4_encrypted_zeroout(), - * add f2fs_restore_and_release_control_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -/* Encryption added and removed here! 
(L: */ - -static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; - -module_param(num_prealloc_crypto_pages, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_pages, - "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); - -static mempool_t *f2fs_bounce_page_pool; - -static LIST_HEAD(f2fs_free_crypto_ctxs); -static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock); - -static struct workqueue_struct *f2fs_read_workqueue; -static DEFINE_MUTEX(crypto_init); - -static struct kmem_cache *f2fs_crypto_ctx_cachep; -struct kmem_cache *f2fs_crypt_info_cachep; - -/** - * f2fs_release_crypto_ctx() - Releases an encryption context - * @ctx: The encryption context to release. - * - * If the encryption context was allocated from the pre-allocated pool, returns - * it to that pool. Else, frees it. - * - * If there's a bounce page in the context, this frees that. - */ -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) { - mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool); - ctx->w.bounce_page = NULL; - } - ctx->w.control_page = NULL; - if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(f2fs_crypto_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - } -} - -/** - * f2fs_get_crypto_ctx() - Gets an encryption context - * @inode: The inode for which we are doing the crypto - * - * Allocates and initializes an encryption context. - * - * Return: An allocated and initialized encryption context on success; error - * value or NULL otherwise. 
- */ -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode) -{ - struct f2fs_crypto_ctx *ctx = NULL; - unsigned long flags; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (ci == NULL) - return ERR_PTR(-ENOKEY); - - /* - * We first try getting the ctx from a free list because in - * the common case the ctx will have an allocated and - * initialized crypto tfm, so it's probably a worthwhile - * optimization. For the bounce page, we first try getting it - * from the kernel allocator because that's just about as fast - * as getting it from a list and because a cache of free pages - * should generally be a "last resort" option for a filesystem - * to be able to do its job. - */ - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs, - struct f2fs_crypto_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags &= ~F2FS_WRITE_PATH_FL; - return ctx; -} - -/* - * Call f2fs_decrypt on every single page, reusing the encryption - * context. 
- */ -static void completion_pages(struct work_struct *work) -{ - struct f2fs_crypto_ctx *ctx = - container_of(work, struct f2fs_crypto_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = f2fs_decrypt(ctx, page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else - SetPageUptodate(page); - unlock_page(page); - } - f2fs_release_crypto_ctx(ctx); - bio_put(bio); -} - -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(f2fs_read_workqueue, &ctx->r.work); -} - -static void f2fs_crypto_destroy(void) -{ - struct f2fs_crypto_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list) - kmem_cache_free(f2fs_crypto_ctx_cachep, pos); - INIT_LIST_HEAD(&f2fs_free_crypto_ctxs); - if (f2fs_bounce_page_pool) - mempool_destroy(f2fs_bounce_page_pool); - f2fs_bounce_page_pool = NULL; -} - -/** - * f2fs_crypto_initialize() - Set up for f2fs encryption. - * - * We only call this when we start accessing encrypted files, since it - * results in memory getting allocated that wouldn't otherwise be used. - * - * Return: Zero on success, non-zero otherwise. 
- */ -int f2fs_crypto_initialize(void) -{ - int i, res = -ENOMEM; - - if (f2fs_bounce_page_pool) - return 0; - - mutex_lock(&crypto_init); - if (f2fs_bounce_page_pool) - goto already_initialized; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct f2fs_crypto_ctx *ctx; - - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL); - if (!ctx) - goto fail; - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - } - - /* must be allocated at the last step to avoid race condition above */ - f2fs_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!f2fs_bounce_page_pool) - goto fail; - -already_initialized: - mutex_unlock(&crypto_init); - return 0; -fail: - f2fs_crypto_destroy(); - mutex_unlock(&crypto_init); - return res; -} - -/** - * f2fs_exit_crypto() - Shutdown the f2fs encryption system - */ -void f2fs_exit_crypto(void) -{ - f2fs_crypto_destroy(); - - if (f2fs_read_workqueue) - destroy_workqueue(f2fs_read_workqueue); - if (f2fs_crypto_ctx_cachep) - kmem_cache_destroy(f2fs_crypto_ctx_cachep); - if (f2fs_crypt_info_cachep) - kmem_cache_destroy(f2fs_crypt_info_cachep); -} - -int __init f2fs_init_crypto(void) -{ - int res = -ENOMEM; - - f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0); - if (!f2fs_read_workqueue) - goto fail; - - f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypto_ctx_cachep) - goto fail; - - f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypt_info_cachep) - goto fail; - - return 0; -fail: - f2fs_exit_crypto(); - return res; -} - -void f2fs_restore_and_release_control_page(struct page **page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. 
*/ - bounce_page = *page; - ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - f2fs_restore_control_page(bounce_page); -} - -void f2fs_restore_control_page(struct page *data_page) -{ - struct f2fs_crypto_ctx *ctx = - (struct f2fs_crypto_ctx *)page_private(data_page); - - set_page_private(data_page, (unsigned long)NULL); - ClearPagePrivate(data_page); - unlock_page(data_page); - f2fs_release_crypto_ctx(ctx); -} - -/** - * f2fs_crypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation - */ -static void f2fs_crypt_complete(struct crypto_async_request *req, int res) -{ - struct f2fs_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -typedef enum { - F2FS_DECRYPT = 0, - F2FS_ENCRYPT, -} f2fs_direction_t; - -static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, - struct inode *inode, - f2fs_direction_t rw, - pgoff_t index, - struct page *src_page, - struct page *dest_page) -{ - u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist dst, src; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; - int res = 0; - - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); - return -ENOMEM; - } - ablkcipher_request_set_callback( - req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_crypt_complete, &ecr); - - BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - F2FS_XTS_TWEAK_SIZE - sizeof(index)); - - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); - sg_init_table(&src, 1); - 
sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); - if (rw == F2FS_DECRYPT) - res = crypto_ablkcipher_decrypt(req); - else - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } - ablkcipher_request_free(req); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", - __func__, res); - return res; - } - return 0; -} - -static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx) -{ - ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT); - if (ctx->w.bounce_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_WRITE_PATH_FL; - return ctx->w.bounce_page; -} - -/** - * f2fs_encrypt() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. - * - * Called on the page write path. The caller must call - * f2fs_restore_control_page() on the returned ciphertext page to - * release the bounce buffer and the encryption context. - * - * Return: An allocated page with the encrypted content on success. Else, an - * error value or NULL. - */ -struct page *f2fs_encrypt(struct inode *inode, - struct page *plaintext_page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - int err; - - BUG_ON(!PageLocked(plaintext_page)); - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - return (struct page *)ctx; - - /* The encryption operation will require a bounce page. 
*/ - ciphertext_page = alloc_bounce_page(ctx); - if (IS_ERR(ciphertext_page)) - goto err_out; - - ctx->w.control_page = plaintext_page; - err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); - if (err) { - ciphertext_page = ERR_PTR(err); - goto err_out; - } - - SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)ctx); - lock_page(ciphertext_page); - return ciphertext_page; - -err_out: - f2fs_release_crypto_ctx(ctx); - return ciphertext_page; -} - -/** - * f2fs_decrypt() - Decrypts a page in-place - * @ctx: The encryption context. - * @page: The page to decrypt. Must be locked. - * - * Decrypts page in-place using the ctx encryption context. - * - * Called from the read completion callback. - * - * Return: Zero on success, non-zero otherwise. - */ -int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page) -{ - BUG_ON(!PageLocked(page)); - - return f2fs_page_crypto(ctx, page->mapping->host, - F2FS_DECRYPT, page->index, page, page); -} - -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int f2fs_decrypt_one(struct inode *inode, struct page *page) -{ - struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode); - int ret; - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = f2fs_decrypt(ctx, page); - f2fs_release_crypto_ctx(ctx); - return ret; -} - -bool f2fs_valid_contents_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS); -} - -/** - * f2fs_validate_encryption_key_size() - Validate the encryption key size - * @mode: The key mode. - * @size: The key size to validate. - * - * Return: The validated key size for @mode. Zero if invalid. 
- */ -uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size) -{ - if (size == f2fs_encryption_key_size(mode)) - return size; - return 0; -} diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c deleted file mode 100644 index 5de2d866a25c..000000000000 --- a/fs/f2fs/crypto_key.c +++ /dev/null @@ -1,254 +0,0 @@ -/* - * linux/fs/f2fs/crypto_key.c - * - * Copied from linux/fs/f2fs/crypto_key.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions for f2fs - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct f2fs_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - -/** - * f2fs_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. - * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. - * - * Return: Zero on success; non-zero otherwise. 
- */ -static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], - char source_key[F2FS_AES_256_XTS_KEY_SIZE], - char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - ablkcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - F2FS_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - - sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - F2FS_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } -out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); - return res; -} - -static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) -{ - if (!ci) - return; - - key_put(ci->ci_keyring_key); - crypto_free_ablkcipher(ci->ci_ctfm); - kmem_cache_free(f2fs_crypt_info_cachep, ci); -} - -void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(fi->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&fi->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - f2fs_free_crypt_info(ci); -} - -int _f2fs_get_encryption_info(struct inode *inode) -{ - 
struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *crypt_info; - char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct f2fs_encryption_key *master_key; - struct f2fs_encryption_context ctx; - const struct user_key_payload *ukp; - struct crypto_ablkcipher *ctfm; - const char *cipher_str; - char raw_key[F2FS_MAX_KEY_SIZE]; - char mode; - int res; - - res = f2fs_crypto_initialize(); - if (res) - return res; -retry: - crypt_info = ACCESS_ONCE(fi->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - f2fs_free_encryption_info(inode, crypt_info); - goto retry; - } - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res < 0) - return res; - else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "f2fs: unsupported key mode %d (ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - - memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - 
sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, - "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - BUG_ON(keyring_key->type != &key_type_logon); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct f2fs_encryption_key *)ukp->data; - BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != - F2FS_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); - res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - if (res) - goto out; - - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - f2fs_encryption_key_size(mode)); - if (res) - goto out; - - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) { - f2fs_free_crypt_info(crypt_info); - goto retry; - } - return 0; - -out: - if (res == -ENOKEY && !S_ISREG(inode->i_mode)) - res = 0; - - f2fs_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int f2fs_has_encryption_key(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - - return (fi->i_crypt_info != NULL); -} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c deleted file mode 100644 index 
e504f548b64e..000000000000 --- a/fs/f2fs/crypto_policy.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * copied from linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility. - * - * This contains encryption policy functions for f2fs with some modifications - * to support f2fs-specific xattr APIs. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -static int f2fs_inode_has_encryption_context(struct inode *inode) -{ - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL); - return (res > 0); -} - -/* - * check whether the policy is consistent with the encryption context - * for the inode - */ -static int f2fs_is_encryption_context_consistent_with_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL); - - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); -} - -static int f2fs_create_encryption_context_from_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - - if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); - return -EINVAL; - } - - if 
(!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } - - if (policy->flags & ~F2FS_POLICY_FLAGS_VALID) - return -EINVAL; - - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - - return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL, XATTR_CREATE); -} - -int f2fs_process_policy(const struct f2fs_encryption_policy *policy, - struct inode *inode) -{ - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (policy->version != 0) - return -EINVAL; - - if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - - if (!f2fs_inode_has_encryption_context(inode)) { - if (!f2fs_empty_dir(inode)) - return -ENOTEMPTY; - return f2fs_create_encryption_context_from_policy(inode, - policy); - } - - if (f2fs_is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; -} - -int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res; - - if (!f2fs_encrypted_inode(inode)) - return -ENODATA; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res != sizeof(ctx)) - return -ENODATA; - if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - 
memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int f2fs_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - struct f2fs_crypt_info *parent_ci, *child_ci; - int res; - - if ((parent == NULL) || (child == NULL)) { - pr_err("parent %p child %p\n", parent, child); - BUG_ON(1); - } - - /* no restrictions if the parent directory is not encrypted */ - if (!f2fs_encrypted_inode(parent)) - return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!f2fs_encrypted_inode(child)) - return 0; - res = f2fs_get_encryption_info(parent); - if (res) - return 0; - res = f2fs_get_encryption_info(child); - if (res) - return 0; - parent_ci = F2FS_I(parent)->i_crypt_info; - child_ci = F2FS_I(child)->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) - return 0; - - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); -} - -/** - * f2fs_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. 
- * - * Return: Zero on success, non-zero otherwise - */ -int f2fs_inherit_context(struct inode *parent, struct inode *child, - struct page *ipage) -{ - struct f2fs_encryption_context ctx; - struct f2fs_crypt_info *ci; - int res; - - res = f2fs_get_encryption_info(parent); - if (res < 0) - return res; - - ci = F2FS_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE); - - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), ipage, XATTR_CREATE); -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac071..7a3ac306a57c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include "f2fs.h" @@ -32,11 +34,16 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_vec *bvec; int i; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + bio->bi_error = -EIO; +#endif + if (f2fs_bio_encrypted(bio)) { if (bio->bi_error) { - f2fs_release_crypto_ctx(bio->bi_private); + fscrypt_release_ctx(bio->bi_private); } else { - f2fs_end_io_crypto_work(bio->bi_private, bio); + fscrypt_decrypt_bio_pages(bio->bi_private, bio); return; } } @@ -45,7 +52,8 @@ static void f2fs_read_end_io(struct bio *bio) struct page *page = bvec->bv_page; if (!bio->bi_error) { - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); @@ -64,19 +72,16 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - f2fs_restore_and_release_control_page(&page); + 
fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && - !list_empty(&sbi->cp_wait.task_list)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -100,6 +105,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(rw)) { + atomic_inc(&sbi->nr_wb_bios); + if (f2fs_sb_mounted_hmsmr(sbi->sb) && + current->plug && (type == DATA || type == NODE)) + blk_finish_plug(current->plug); + } + submit_bio(rw, bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -112,12 +129,58 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, io->bio); + __submit_bio(io->sbi, fio->rw, io->bio, fio->type); io->bio = NULL; } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - enum page_type type, int rw) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) +{ + struct bio_vec *bvec; + struct page *target; + int i; + + if (!io->bio) + return false; + + if (!inode && !page && !ino) + return true; + + bio_for_each_segment_all(bvec, io->bio, i) { + + if (bvec->bv_page->mapping) + target = bvec->bv_page; + else + target = fscrypt_control_page(bvec->bv_page); + + if (inode && inode == target->mapping->host) + return true; + if (page && page == target) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +static bool has_merged_page(struct f2fs_sb_info *sbi, struct 
inode *inode, + struct page *page, nid_t ino, + enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + bool ret; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, page, ino); + up_read(&io->io_rwsem); + return ret; +} + +static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -126,6 +189,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); + if (!__has_merged_page(io, inode, page, ino)) + goto out; + /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -135,9 +201,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); +out: up_write(&io->io_rwsem); } +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw) +{ + __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); +} + +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) +{ + if (has_merged_page(sbi, inode, page, ino, type)) + __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); +} + +void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); +} + /* * Fill the locked page with data located in the block address. * Return unlocked page. @@ -145,20 +233,21 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct page *page = fio->encrypted_page ? 
+ fio->encrypted_page : fio->page; trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, fio->rw, bio, fio->type); return 0; } @@ -172,39 +261,49 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io = is_read ? &sbi->read_io : &sbi->write_io[btype]; - verify_block_addr(sbi, fio->blk_addr); + if (fio->old_blkaddr != NEW_ADDR) + verify_block_addr(sbi, fio->old_blkaddr); + verify_block_addr(sbi, fio->new_blkaddr); down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - - if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || + if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); + io->bio = __bio_alloc(sbi, fio->new_blkaddr, + bio_blocks, is_read); io->fio = *fio; } bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < + PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } - io->last_block_in_bio = fio->blk_addr; + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -213,39 +312,63 @@ alloc_new: */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; } -int reserve_new_block(struct dnode_of_data *dn) +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + dn->data_blkaddr = blkaddr; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if 
(unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); - mark_inode_dirty(dn->inode); - sync_inode_page(dn); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -325,13 +448,14 @@ got_it: * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); return page; } - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) @@ -385,14 +509,14 @@ repeat: /* wait for read completion */ lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -412,7 +536,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -436,45 +560,42 @@ repeat: goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } static int __allocate_data_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = 
F2FS_I_SB(dn->inode); - struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -489,72 +610,43 @@ alloc: set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) + f2fs_i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, - size_t count) +ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn; - u64 start = F2FS_BYTES_TO_BLK(offset); - u64 len = F2FS_BYTES_TO_BLK(count); - bool allocated; - u64 end_offset; + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_map_blocks map; + ssize_t ret = 0; - while (len) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; - /* When reading holes, we need its node page */ - set_new_dnode(&dn, 
inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) - goto out; + map.m_next_pgofs = NULL; - allocated = false; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - - while (dn.ofs_in_node < end_offset && len) { - block_t blkaddr; - - if (unlikely(f2fs_cp_error(sbi))) - goto sync_out; - - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) - goto sync_out; - allocated = true; - } - len--; - start++; - dn.ofs_in_node++; - } - - if (allocated) - sync_inode_page(&dn); - - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } - return; - -sync_out: - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); -out: - f2fs_unlock_op(sbi); - return; + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + if (!f2fs_has_inline_data(inode)) + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + return ret; } /* @@ -566,156 +658,181 @@ out: * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + int mode = create ? 
ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; + block_t blkaddr; + + if (!maxblocks) + return 0; map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; - if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; goto out; } +next_dnode: if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - if (err == -ENOENT) + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + if (err == -ENOENT) { err = 0; + if (map->m_next_pgofs) + *map->m_next_pgofs = + get_next_page_offset(&dn, pgofs); + } goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + prealloc = 0; + ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + +next_block: + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto put_out; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + } else { + err = __allocate_data_block(&dn); + if (!err) { + set_inode_flag(inode, FI_APPEND_WRITE); + allocated = true; + } } - err = __allocate_data_block(&dn); if (err) - goto put_out; - allocated = true; + goto sync_out; map->m_flags = F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; } else { - if (flag != F2FS_GET_BLOCK_FIEMAP || - dn.data_blkaddr != NEW_ADDR) { - if (flag == 
F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; } - - /* - * preallocated unwritten block should be mapped - * for fiemap. - */ - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags = F2FS_MAP_UNWRITTEN; + if (flag == F2FS_GET_BLOCK_FIEMAP && + blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + } + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } } - map->m_flags |= F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - map->m_len = 1; + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + if (map->m_len == 0) { + /* preallocated unwritten block should be mapped for fiemap. */ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + } else if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || + flag == F2FS_GET_BLOCK_PRE_DIO) { + ofs++; + map->m_len++; + } else { + goto sync_out; + } + +skip: dn.ofs_in_node++; pgofs++; -get_next: - if (dn.ofs_in_node >= end_offset) { - if (allocated) - sync_inode_page(&dn); - allocated = false; - f2fs_put_dnode(&dn); + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); - if (err) { - if (err == -ENOENT) - err = 0; - goto unlock_out; + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + allocated = dn.node_changed; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + dn.ofs_in_node = end_offset; } - if 
(maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; - } - } + f2fs_put_dnode(&dn); - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); } + allocated = false; + goto next_dnode; + sync_out: - if (allocated) - sync_inode_page(&dn); -put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); + } out: trace_f2fs_map_blocks(inode, map, err); return err; } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag) + struct buffer_head *bh, int create, int flag, + pgoff_t *next_pgofs) { struct f2fs_map_blocks map; int ret; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; + map.m_next_pgofs = next_pgofs; ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { @@ -727,23 +844,29 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag) + struct buffer_head *bh_result, int create, int flag, + pgoff_t 
*next_pgofs) { - return __get_data_block(inode, iblock, bh_result, create, flag); + return __get_data_block(inode, iblock, bh_result, create, + flag, next_pgofs); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO); + F2FS_GET_BLOCK_DIO, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP); + F2FS_GET_BLOCK_BMAP, NULL); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -761,10 +884,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + pgoff_t next_pgofs; + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -777,82 +900,64 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP); + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); if (ret) goto out; /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof 
&& blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. + start_blk = next_pgofs; + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; + flags |= FIEMAP_EXTENT_LAST; } + + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } + + if (start_blk > last_blk || ret) + goto out; + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + 
start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -862,10 +967,41 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } +static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + + return bio; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
@@ -884,13 +1020,13 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - struct block_device *bdev = inode->i_sb->s_bdev; struct f2fs_map_blocks map; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; + map.m_next_pgofs = NULL; for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { @@ -929,7 +1065,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: @@ -942,8 +1078,9 @@ got_it: goto confused; } } else { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); goto next_page; } @@ -954,35 +1091,15 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } if (bio == NULL) { - struct f2fs_crypto_ctx *ctx = NULL; - - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - goto set_error_page; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback( - F2FS_I_SB(inode), block_nr); - } - - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - f2fs_release_crypto_ctx(ctx); + bio = f2fs_grab_bio(inode, block_nr, nr_pages); + if (IS_ERR(bio)) { + bio = NULL; goto set_error_page; } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -992,22 +1109,22 @@ submit_and_realloc: goto next_page; set_error_page: SetPageError(page); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + 
zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } unlock_page(page); next_page: if (pages) - page_cache_release(page); + put_page(page); } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); return 0; } @@ -1054,23 +1171,33 @@ int do_write_data_page(struct f2fs_io_info *fio) if (err) return err; - fio->blk_addr = dn.data_blkaddr; + fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (fio->blk_addr == NULL_ADDR) { + if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); goto out_writepage; } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->blk_addr); - - fio->encrypted_page = f2fs_encrypt(inode, fio->page); + fio->old_blkaddr); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } @@ -1081,20 +1208,19 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ - if (unlikely(fio->blk_addr != NEW_ADDR && + if (unlikely(fio->old_blkaddr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -1108,7 +1234,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) - >> PAGE_CACHE_SHIFT; + >> PAGE_SHIFT; + loff_t psize = (page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -1129,37 +1256,37 @@ static int f2fs_write_data_page(struct page *page, * If the offset is out-of-range of file size, * this page does not have to be written to disk. 
*/ - offset = i_size & (PAGE_CACHE_SIZE - 1); + offset = i_size & (PAGE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) goto out; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); write: if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; err = do_write_data_page(&fio); goto done; } - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - SetPageError(page); - goto out; - } - if (!wbc->for_reclaim) need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0)) + else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; err = -EAGAIN; @@ -1168,6 +1295,8 @@ write: err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) err = do_write_data_page(&fio); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1178,25 +1307,24 @@ out: inode_dec_dirty_pages(inode); if (err) ClearPageUptodate(page); + + if (wbc->for_reclaim) { + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + remove_dirty_inode(inode); + } + unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, 
DATA, WRITE); + return 0; redirty_out: redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + unlock_page(page); + return err; } /* @@ -1205,8 +1333,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) + struct writeback_control *wbc) { int ret = 0; int done = 0; @@ -1219,10 +1346,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; + int nwritten = 0; pagevec_init(&pvec, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1232,8 +1359,8 @@ next: cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -1277,12 +1404,10 @@ continue_unlock: goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, + DATA, true); else goto continue_unlock; } @@ -1291,16 +1416,13 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = (*writepage)(page, wbc, data); + ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done_index = page->index 
+ 1; - done = 1; - break; - } + done_index = page->index + 1; + done = 1; + break; + } else { + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -1313,11 +1435,6 @@ continue_unlock: cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; @@ -1327,6 +1444,10 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; + if (nwritten) + f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA, WRITE); + return ret; } @@ -1335,11 +1456,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool locked = false; + struct blk_plug plug; int ret; - long diff; - - trace_f2fs_writepages(mapping->host, wbc, DATA); /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1354,41 +1472,119 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - diff = nr_pages_to_write(sbi, DATA, wbc); + trace_f2fs_writepages(mapping->host, wbc, DATA); - if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&sbi->writepages); - locked = true; - } - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - if (locked) - mutex_unlock(&sbi->writepages); + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc); + blk_finish_plug(&plug); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. 
+ */ - remove_dirty_dir_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + /* + * we already allocated all the blocks, so we don't need to get + * the block addresses when there is no need to fill the page. 
+ */ + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + return 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_inline_node(ipage); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1396,15 +1592,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; - pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, 
len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1424,98 +1618,63 @@ repeat: *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); - - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; - } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; - } - - err = f2fs_get_block(&dn, index); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + goto fail; - f2fs_wait_on_page_writeback(page, DATA); + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; + } + } + + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_CACHE_SIZE) - goto out_update; - if (PageUptodate(page)) - goto out_clear; + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out_update; - } - - if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 
0, PAGE_CACHE_SIZE); + if (blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); } else { - struct f2fs_io_info fio = { - .sbi = sbi, - .type = DATA, - .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, - .page = page, - .encrypted_page = NULL, - }; - err = f2fs_submit_page_bio(&fio); - if (err) - goto fail; + struct bio *bio; - lock_page(page); - if (unlikely(!PageUptodate(page))) { - err = -EIO; + bio = f2fs_grab_bio(inode, blkaddr, 1); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); goto fail; } + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + err = -EFAULT; + goto fail; + } + + __submit_bio(sbi, READ_SYNC, bio, DATA); + + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - - /* avoid symlink page */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - err = f2fs_decrypt_one(inode, page); - if (err) - goto fail; + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; } } -out_update: - SetPageUptodate(page); -out_clear: - clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); @@ -1531,15 +1690,28 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - set_page_dirty(page); - - if (pos + copied > i_size_read(inode)) { - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); - update_inode_page(inode); + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. 
+ */ + if (!PageUptodate(page)) { + if (unlikely(copied != PAGE_SIZE)) + copied = 0; + else + SetPageUptodate(page); } + if (!copied) + goto unlock_out; + set_page_dirty(page); + clear_cold_data(page); + + if (pos + copied > i_size_read(inode)) + f2fs_i_size_write(inode, pos + copied); +unlock_out: f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1558,44 +1730,37 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, } static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) + loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; + struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + int rw = iov_iter_rw(iter); int err; - /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - err = check_direct_IO(inode, iter, offset); if (err) return err; - trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; + if (test_opt(F2FS_I_SB(inode), LFS)) + return 0; - if (iov_iter_rw(iter) == WRITE) { - __allocate_data_blocks(inode, offset, count); - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { - err = -EIO; - goto out; - } + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + + down_read(&F2FS_I(inode)->dio_rwsem[rw]); + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); + up_read(&F2FS_I(inode)->dio_rwsem[rw]); + + if (rw == WRITE) { + if (err > 0) + set_inode_flag(inode, FI_UPDATE_WRITE); + else if (err < 0) + f2fs_write_failed(mapping, offset + count); } - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); -out: - if (err < 0 && iov_iter_rw(iter) 
== WRITE) - f2fs_write_failed(mapping, offset + count); - - trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; } @@ -1607,7 +1772,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && - (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)) + (offset % PAGE_SIZE || length != PAGE_SIZE)) return; if (PageDirty(page)) { @@ -1623,6 +1788,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return; + set_page_private(page, 0); ClearPagePrivate(page); } @@ -1636,10 +1802,42 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } +/* + * This was copied from __set_page_dirty_buffers which gives higher performance + * in very high speed storages. 
(e.g., pmem) + */ +void f2fs_set_page_dirty_nobuffers(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + + if (unlikely(!mapping)) + return; + + spin_lock(&mapping->private_lock); + memcg = mem_cgroup_begin_page_stat(page); + SetPageDirty(page); + spin_unlock(&mapping->private_lock); + + spin_lock_irqsave(&mapping->tree_lock, flags); + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping, memcg); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + mem_cgroup_end_page_stat(memcg); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return; +} + static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -1647,7 +1845,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { @@ -1662,7 +1861,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } @@ -1683,6 +1882,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block_bmap); } +#ifdef CONFIG_MIGRATION +#include + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written && !mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + + /* + * A reference is expected 
if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -1695,4 +1946,7 @@ const struct address_space_operations f2fs_dblock_aops = { .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 478e5d54154f..fb245bd302e4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,23 +38,30 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, 
F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -105,7 +112,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -140,6 +147,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ -148,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += 
f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + if (f2fs_discard_en(sbi)) + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -161,7 +171,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; - si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); @@ -189,18 +199,18 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -211,20 +221,24 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%s). 
#%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", @@ -236,6 +250,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -269,7 +285,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -290,17 +307,21 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 
0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4lld in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4lld\n", + si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", @@ -406,20 +427,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c1678ba8f92..e634a637c443 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -17,8 +17,8 @@ static unsigned long dir_blocks(struct inode *inode) { - return ((unsigned long long) (i_size_read(inode) + 
PAGE_CACHE_SIZE - 1)) - >> PAGE_CACHE_SHIFT; + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -77,7 +83,7 @@ static unsigned long dir_block_index(unsigned int level, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct f2fs_filename *fname, + struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct page **res_page) @@ -95,23 +101,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. 
- */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); - struct f2fs_str *name = &fname->disk_name; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -124,6 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, de = &d->dentry[bit_pos]; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -141,10 +147,6 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -157,7 +159,7 @@ found: static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, - struct f2fs_filename *fname, + struct fscrypt_name *fname, struct page **res_page) { struct qstr name = FSTR_TO_QSTR(&fname->disk_name); @@ -170,9 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name); - - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); + if(fname->hash) + namehash = cpu_to_le32(fname->hash); + else + namehash = f2fs_dentry_hash(&name); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -185,8 +188,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = 
true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, &max_slots, @@ -207,6 +215,44 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, return de; } +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + unsigned int max_depth; + unsigned int level; + + if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; + de = find_in_inline_dir(dir, fname, res_page); + goto out; + } + + if (npages == 0) { + *res_page = NULL; + goto out; + } + + max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } + + for (level = 0; level < max_depth; level++) { + *res_page = NULL; + de = find_in_level(dir, level, fname, res_page); + if (de || IS_ERR(*res_page)) + break; + } +out: + return de; +} + /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), @@ -214,72 +260,42 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, * Entry is guaranteed to be valid. 
*/ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - struct qstr *child, struct page **res_page) + const struct qstr *child, struct page **res_page) { - unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - unsigned int max_depth; - unsigned int level; - struct f2fs_filename fname; + struct fscrypt_name fname; int err; - *res_page = NULL; - - err = f2fs_fname_setup_filename(dir, child, 1, &fname); - if (err) + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + *res_page = ERR_PTR(err); return NULL; - - if (f2fs_has_inline_dentry(dir)) { - de = find_in_inline_dir(dir, &fname, res_page); - goto out; } - if (npages == 0) - goto out; + de = __f2fs_find_entry(dir, &fname, res_page); - max_depth = F2FS_I(dir)->i_current_depth; - - for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, &fname, res_page); - if (de) - break; - } -out: - f2fs_fname_free_filename(&fname); + fscrypt_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; + struct qstr dotdot = QSTR_INIT("..", 2); - if (f2fs_has_inline_dentry(dir)) - return f2fs_parent_inline_dir(dir, p); - - page = get_lock_data_page(dir, 0, false); - if (IS_ERR(page)) - return NULL; - - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); - return de; + return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + f2fs_dentry_kunmap(dir, *page); + f2fs_put_page(*page, 0); } return res; @@ -290,14 
+306,14 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, type); + f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -305,7 +321,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); @@ -335,24 +351,14 @@ int update_dent_inode(struct inode *inode, struct inode *to, void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { - struct f2fs_dir_entry *de; + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); - de = &d->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = 0; - de->ino = cpu_to_le32(inode->i_ino); - memcpy(d->filename[0], ".", 1); - set_de_type(de, inode->i_mode); + /* update dirent of "." */ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); - de = &d->dentry[1]; - de->hash_code = 0; - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(d->filename[1], "..", 2); - set_de_type(de, parent->i_mode); - - test_and_set_bit_le(0, (void *)d->bitmap); - test_and_set_bit_le(1, (void *)d->bitmap); + /* update dirent of ".." 
*/ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, @@ -382,32 +388,38 @@ static int make_empty_dir(struct inode *inode, } struct page *init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *name, struct page *dpage) + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) { struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = new_inode_page(inode); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); if (err) goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, orig_name, page); if (err) goto put_error; if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { - err = f2fs_inherit_context(dir, inode, page); + err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; } @@ -419,14 +431,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (name) - init_dent_inode(name, page); + if (new_name) + init_dent_inode(new_name, page); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. 
*/ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + if (is_inode_flag_set(inode, FI_INC_LINK)) { file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, @@ -434,41 +446,33 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, */ if (inode->i_nlink == 0) remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); - inc_nlink(inode); + f2fs_i_links_write(inode, true); } return page; put_error: + clear_nlink(inode); + update_inode(inode, page); f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ - truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); - remove_inode_page(inode); return ERR_PTR(err); } void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + f2fs_mark_inode_dirty_sync(dir); - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } int room_for_filename(const void *bitmap, int slots, int max_slots) @@ -505,15 +509,16 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, memcpy(d->filename[bit_pos], name->name, 
name->len); de->ino = cpu_to_le32(ino); set_de_type(de, mode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). - */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -526,28 +531,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct f2fs_filename fname; - struct qstr new_name; - int slots, err; - - err = f2fs_fname_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -556,10 +544,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -573,10 +563,8 @@ 
start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -592,11 +580,12 @@ start: ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA); + f2fs_wait_on_page_writeback(dentry_page, DATA, true); if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, + orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -606,14 +595,12 @@ add_dentry: } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); if (inode) { - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -622,14 +609,49 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode_page(dir); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: - f2fs_fname_free_filename(&fname); + + return err; +} + +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + + 
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + + fscrypt_free_filename(&fname); return err; } @@ -639,46 +661,39 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } - /* we don't need to mark_inode_dirty now */ - update_inode(inode, page); f2fs_put_page(page, 1); - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + clear_inode_flag(inode, FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } -void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - if (page) - update_inode(dir, page); - else - update_inode_page(dir); - } + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); + f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); } up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); else release_orphan_inode(sbi); } @@ -695,11 +710,13 @@ void f2fs_delete_entry(struct 
f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; @@ -714,9 +731,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, NULL); + f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { @@ -767,12 +785,12 @@ bool f2fs_empty_dir(struct inode *dir) } bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, - unsigned int start_pos, struct f2fs_str *fstr) + unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -782,10 +800,13 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, break; de = &d->dentry[bit_pos]; - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + if (de->name_len == 0) { + bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -794,15 +815,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); - if (!de_name.name) - return false; - - memcpy(de_name.name, d->filename[bit_pos], de_name.len); - - 
ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, - &de_name, fstr); - kfree(de_name.name); + ret = fscrypt_fname_disk_to_usr(d->inode, + (u32)de->hash_code, 0, + &de_name, fstr); if (ret < 0) return true; @@ -829,16 +844,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; - struct f2fs_str fstr = FSTR_INIT(NULL, 0); + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; if (f2fs_encrypted_inode(inode)) { - err = f2fs_get_encryption_info(inode); - if (err) + err = fscrypt_get_encryption_info(inode); + if (err && err != -ENOKEY) return err; - err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, - &fstr); + err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) return err; } @@ -855,36 +869,47 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } + err = 0; out: - f2fs_fname_crypto_free_buffer(&fstr); + fscrypt_fname_free_buffer(&fstr); return err; } +static int f2fs_dir_open(struct inode *inode, struct file *filp) +{ + if (f2fs_encrypted_inode(inode)) + return fscrypt_get_encryption_info(inode) ? 
-EACCES : 0; + return 0; +} + const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate = f2fs_readdir, .fsync = f2fs_sync_file, + .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e11b..2b06d4fcd954 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -33,10 +33,11 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, en->ei = *ei; INIT_LIST_HEAD(&en->list); + en->et = et; rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,11 +46,29 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. 
+ */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + spin_lock(&sbi->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); } static struct extent_tree *__grab_extent_tree(struct inode *inode) @@ -68,11 +87,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -127,32 +148,21 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) + struct extent_tree *et) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } + __release_extent_node(sbi, et, en); node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -160,38 +170,38 @@ static void __drop_largest_extent(struct inode *inode, { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (fofs < largest->fofs + largest->len && 
fofs + len > largest->fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; + f2fs_mark_inode_dirty_sync(inode); + } } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +212,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -230,9 +241,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, if (en) { *ei = en->ei; spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) + if (!list_empty(&en->list)) { list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; + et->cached_en = en; + } spin_unlock(&sbi->extent_lock); ret = true; } @@ -325,12 +337,12 @@ lookup_neighbors: return en; } -static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, +static struct extent_node 
*__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, - struct extent_node **den, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -340,28 +352,34 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) { - __detach_extent_node(sbi, et, prev_ex); - *den = prev_ex; - } + if (en) + __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; en = next_ex; } - if (en) { - __try_update_largest_extent(et, en); + if (!en) + return NULL; + + __try_update_largest_extent(inode, et, en); + + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); et->cached_en = en; } + spin_unlock(&sbi->extent_lock); return en; } -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, +static struct extent_node *__insert_extent_tree(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p = &et->root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -388,8 +406,13 @@ do_insert: if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); + + /* update in global extent list */ + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); et->cached_en = en; + spin_unlock(&sbi->extent_lock); return en; } @@ -412,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, write_lock(&et->lock); - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); return false; } @@ 
-454,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(sbi, et, &ei, + en1 = __insert_extent_tree(inode, et, &ei, NULL, NULL); next_en = en1; } else { @@ -475,9 +498,9 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); else - __detach_extent_node(sbi, et, en); + __release_extent_node(sbi, et, en); /* * if original extent is split into zero or two parts, extent @@ -488,58 +511,28 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, insert_p = NULL; insert_parent = NULL; } - - /* update in global extent list */ - spin_lock(&sbi->extent_lock); - if (!parts && !list_empty(&en->list)) - list_del(&en->list); - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - - /* release extent node */ - if (!parts) - kmem_cache_free(extent_node_slab, en); - en = next_en; } /* 3. 
update extent in extent cache */ if (blkaddr) { - struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en1 = __try_merge_extent_node(sbi, et, &ei, &den, - prev_en, next_en); - if (!en1) - en1 = __insert_extent_tree(sbi, et, &ei, + if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) + __insert_extent_tree(inode, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + __drop_largest_extent(inode, 0, UINT_MAX); + set_inode_flag(inode, FI_NO_EXTENT); } - - spin_lock(&sbi->extent_lock); - if (en1) { - if (list_empty(&en1->list)) - list_add_tail(&en1->list, &sbi->extent_list); - else - list_move_tail(&en1->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - if (den) - kmem_cache_free(extent_node_slab, den); } - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - __free_extent_tree(sbi, et, true); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __free_extent_tree(sbi, et); write_unlock(&et->lock); @@ -548,46 +541,42 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; - unsigned int found; + struct extent_tree *et, *next; + struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. 
remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et); + write_unlock(&et->lock); } + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); } up_write(&sbi->extent_tree_lock); +free_node: /* 2. 
remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -595,34 +584,29 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) remained = nr_shrink - (node_cnt + tree_cnt); spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!remained--) + for (; remained > 0; remained--) { + if (list_empty(&sbi->extent_list)) break; + en = list_first_entry(&sbi->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &sbi->extent_list); + continue; + } + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&sbi->extent_lock); } spin_unlock(&sbi->extent_lock); - /* - * reset ino for searching victims from beginning of global extent tree. - */ - ino = F2FS_ROOT_INO(sbi); - - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } unlock_out: up_write(&sbi->extent_tree_lock); out: @@ -637,16 +621,29 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); + node_cnt = __free_extent_tree(sbi, et); write_unlock(&et->lock); return node_cnt; } +void f2fs_drop_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + set_inode_flag(inode, 
FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + __drop_largest_extent(inode, 0, UINT_MAX); + write_unlock(&et->lock); +} + void f2fs_destroy_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -656,8 +653,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +667,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -689,20 +689,20 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs; + block_t blkaddr; if (!f2fs_may_extent_tree(dn->inode)) return; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + if (dn->data_blkaddr == NEW_ADDR) + blkaddr = NULL_ADDR; + else + blkaddr = dn->data_blkaddr; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) - sync_inode_page(dn); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } void 
f2fs_update_extent_cache_range(struct dnode_of_data *dn, @@ -712,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, if (!f2fs_may_extent_tree(dn->inode)) return; - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } void init_extent_cache_info(struct f2fs_sb_info *sbi) @@ -722,7 +721,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9db5500d63d9..af293e84e5cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,10 +21,12 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) -#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(sbi, condition) \ do { \ @@ -33,7 +35,30 @@ set_sbi_flag(sbi, SBI_NEED_FSCK); \ } \ } while (0) -#define f2fs_down_write(x, y) down_write(x) +#endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_IO, + FAULT_CHECKPOINT, + FAULT_MAX, +}; + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) #endif /* @@ -54,6 +79,10 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 
+#define F2FS_MOUNT_LFS 0x00040000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -74,6 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_HMSMR 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -82,25 +112,30 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) -#define CRCPOLY_LE 0xedb88320 - -static inline __u32 f2fs_crc32(void *buf, size_t len) +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) { - unsigned char *p = (unsigned char *)buf; - __u32 crc = F2FS_SUPER_MAGIC; - int i; - - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); - } - return crc; + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. 
+ */ + smp_mb(); + return waitqueue_active(wq); } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) +static inline void inode_nohighmem(struct inode *inode) { - return f2fs_crc32(buf, buf_size) == blk_crc; + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } /* @@ -119,12 +154,13 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 32 +#define DEF_BATCHED_TRIM_SECTIONS 2 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -158,13 +194,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. 
- */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -177,46 +207,52 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +struct bio_entry { + struct list_head list; + struct bio *bio; + struct completion event; + int error; +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; -#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) -#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) -#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) -#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) -#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) -#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) -#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) -#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) -static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { - int before = nats_in_cursum(rs); - rs->n_nats = cpu_to_le16(before + i); + int before = 
nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } -static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { - int before = sits_in_cursum(rs); - rs->n_sits = cpu_to_le16(before + i); + int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } -static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, - int type) +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) { if (type == NAT_JOURNAL) - return size <= MAX_NAT_JENTRIES(sum); - return size <= MAX_SIT_JENTRIES(sum); + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); } /* @@ -234,13 +270,13 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) -#define F2FS_IOC_SET_ENCRYPTION_POLICY \ - _IOR('f', 19, struct f2fs_encryption_policy) -#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ - _IOW('f', 20, __u8[16]) -#define F2FS_IOC_GET_ENCRYPTION_POLICY \ - _IOW('f', 21, struct f2fs_encryption_policy) +#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT /* * should be same as XFS_IOC_GOINGDOWN. 
@@ -256,33 +292,27 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + /* * For INODE and NODE manager */ /* for directory operations */ -struct f2fs_str { - unsigned char *name; - u32 len; -}; - -struct f2fs_filename { - const struct qstr *usr_fname; - struct f2fs_str disk_name; - f2fs_hash_t hash; -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_str crypto_buf; -#endif -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - struct f2fs_dentry_ptr { struct inode *inode; const void *bitmap; @@ -350,6 +380,7 @@ struct extent_node { struct rb_node rb_node; /* rb node located in rb-tree */ struct list_head list; /* node in global extent list of sbi */ struct extent_info ei; /* extent info */ + struct extent_tree *et; /* extent tree pointer */ }; struct extent_tree { @@ -357,9 +388,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -378,6 +409,7 
@@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ }; /* for flag in get_data_block */ @@ -385,6 +417,8 @@ struct f2fs_map_blocks { #define F2FS_GET_BLOCK_DIO 1 #define F2FS_GET_BLOCK_FIEMAP 2 #define F2FS_GET_BLOCK_BMAP 3 +#define F2FS_GET_BLOCK_PRE_DIO 4 +#define F2FS_GET_BLOCK_PRE_AIO 5 /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -406,15 +440,6 @@ struct f2fs_map_blocks { #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) -/* Encryption algorithms */ -#define F2FS_ENCRYPTION_MODE_INVALID 0 -#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4 - -#include "f2fs_crypto.h" - #define DEF_DIR_LEVEL 0 struct f2fs_inode_info { @@ -429,30 +454,27 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + loff_t last_disk_size; /* lastly written file size */ + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - /* Encryption params 
*/ - struct f2fs_crypt_info *i_crypt_info; -#endif + struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, @@ -497,11 +519,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) +extern void f2fs_mark_inode_dirty_sync(struct inode *); +static inline void __try_update_largest_extent(struct inode *inode, + struct extent_tree *et, struct extent_node *en) { - if (en->ei.len > et->largest.len) + if (en->ei.len > et->largest.len) { et->largest = en->ei; + f2fs_mark_inode_dirty_sync(inode); + } } struct f2fs_nm_info { @@ -511,6 +536,7 @@ struct f2fs_nm_info { nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -544,6 +570,9 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ block_t data_blkaddr; /* block address of the node block */ }; @@ -594,6 +623,7 @@ struct flush_cmd { struct flush_cmd_control { struct 
task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t submit_flush; /* # of issued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -618,6 +648,7 @@ struct f2fs_sm_info { /* for small discard management */ struct list_head discard_list; /* 4KB discard list */ + struct list_head wait_list; /* linked with issued discard bio */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -645,11 +676,12 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, NR_COUNT_TYPE, }; @@ -673,6 +705,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_REVOKE, IPU, OPU, }; @@ -681,7 +714,8 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ - block_t blk_addr; /* block address to be written */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; @@ -695,6 +729,13 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -709,15 +750,31 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* 
need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - int s_flag; /* flags for sbi */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -728,32 +785,36 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* 
for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -770,17 +831,24 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + atomic_t nr_wb_bios; /* # of writeback bios */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* valid inode count */ + struct percpu_counter 
total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -809,7 +877,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -822,11 +890,102 @@ struct f2fs_sb_info { struct list_head s_list; struct mutex umount_mutex; unsigned int shrinker_run_no; + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* For fault injection */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; +#endif }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + +/* For write statistics. Suppose sector size is 512 bytes, + * and the return value is in kbytes. s is of struct f2fs_sb_info. 
+ */ +#define BD_PART_WRITTEN(s) \ +(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ + s->sectors_written_start) >> 1) + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); + u32 *ctx = (u32 *)shash_desc_ctx(shash); + int err; + + shash->tfm = sbi->s_chksum_driver; + shash->flags = 0; + *ctx = F2FS_SUPER_MAGIC; + + err = crypto_shash_update(shash, address, length); + BUG_ON(err); + + return *ctx; +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -909,17 +1068,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { - return sbi->s_flag & (0x01 << type); + return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag |= (0x01 << type); + set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int 
type) { - sbi->s_flag &= ~(0x01 << type); + clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -927,26 +1086,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + spin_lock(&sbi->cp_lock); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { 
down_read(&sbi->cp_rwsem); @@ -959,7 +1149,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) @@ -985,8 +1175,8 @@ static inline bool __remain_node_summaries(int reason) static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { - return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* @@ -1019,22 +1209,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { - block_t valid_block_count; + blkcnt_t diff; + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) + return false; +#endif + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. 
+ */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (unlikely(valid_block_count > sbi->user_block_count)) { - spin_unlock(&sbi->stat_lock); - return false; + sbi->total_valid_block_count += (block_t)(*count); + if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { + diff = sbi->total_valid_block_count - sbi->user_block_count; + *count -= diff; + sbi->total_valid_block_count = sbi->user_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + percpu_counter_sub(&sbi->alloc_valid_block_count, diff); + return false; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; spin_unlock(&sbi->stat_lock); + + f2fs_i_blocks_write(inode, *count, true); return true; } @@ -1045,27 +1250,31 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < count); - inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); + f2fs_i_blocks_write(inode, count, false); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + return; + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1074,28 +1283,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -1103,6 +1312,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) return sbi->total_valid_block_count; } +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + static inline unsigned long 
__bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1182,13 +1396,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } if (inode) - inode->i_blocks++; + f2fs_i_blocks_write(inode, 1, true); - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } @@ -1201,7 +1415,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !inode->i_blocks); - inode->i_blocks--; + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -1215,28 +1429,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return 
grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1261,7 +1477,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } - page_cache_release(page); + put_page(page); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -1396,13 +1612,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ - FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1416,71 +1631,152 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) { - if (!test_bit(flag, &fi->flags)) - set_bit(flag, &fi->flags); + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + f2fs_mark_inode_dirty_sync(inode); + } } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void set_inode_flag(struct inode *inode, int flag) { - return test_bit(flag, &fi->flags); + if 
(!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline int is_inode_flag_set(struct inode *inode, int flag) { - if (test_bit(flag, &fi->flags)) - clear_bit(flag, &fi->flags); + return test_bit(flag, &F2FS_I(inode)->flags); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void clear_inode_flag(struct inode *inode, int flag) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); } -static inline void get_inline_info(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_acl_inode(struct inode *inode, umode_t mode) { + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + blkcnt_t diff, bool add) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + inode->i_blocks = add ? 
inode->i_blocks + diff : + inode->i_blocks - diff; + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode) +{ + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + if (ri->i_inline & F2FS_INLINE_XATTR) - set_inode_flag(fi, FI_INLINE_XATTR); + set_bit(FI_INLINE_XATTR, &fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_inode_flag(fi, FI_INLINE_DATA); + set_bit(FI_INLINE_DATA, &fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_inode_flag(fi, FI_INLINE_DENTRY); + set_bit(FI_INLINE_DENTRY, &fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_inode_flag(fi, FI_DATA_EXIST); + set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_inode_flag(fi, FI_INLINE_DOTS); + set_bit(FI_INLINE_DOTS, &fi->flags); } -static inline void set_raw_inline(struct f2fs_inode_info *fi, - struct 
f2fs_inode *ri) +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; - if (is_inode_flag_set(fi, FI_INLINE_DATA)) + if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; - if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; - if (is_inode_flag_set(fi, FI_DATA_EXIST)) + if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; - if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); + return is_inode_flag_set(inode, FI_INLINE_XATTR); } -static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(&fi->vfs_inode)) + if (f2fs_has_inline_xattr(inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } @@ -1502,43 +1798,43 @@ static inline int inline_xattr_size(struct inode *inode) static inline int f2fs_has_inline_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); + return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline void f2fs_clear_inline_inode(struct inode *inode) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + clear_inode_flag(inode, FI_INLINE_DATA); + clear_inode_flag(inode, FI_DATA_EXIST); } static inline int f2fs_exist_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); + return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_has_inline_dots(struct inode *inode) { - return 
is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); + return is_inode_flag_set(inode, FI_INLINE_DOTS); } static inline bool f2fs_is_atomic_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); + return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_volatile_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_VOLATILE_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); } static inline bool f2fs_is_drop_cache(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); + return is_inode_flag_set(inode, FI_DROP_CACHE); } static inline void *inline_data_addr(struct page *page) @@ -1549,7 +1845,7 @@ static inline void *inline_data_addr(struct page *page) static inline int f2fs_has_inline_dentry(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); + return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) @@ -1566,11 +1862,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1580,13 +1878,7 @@ static inline int f2fs_readonly(struct super_block *sb) static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { - return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); -} - -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + return is_set_ckpt_flags(sbi, 
CP_ERROR_FLAG); } static inline bool is_dot_dotdot(const struct qstr *str) @@ -1602,13 +1894,21 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - mode_t mode = inode->i_mode; - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || - is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT)) return false; - return S_ISREG(mode); + return S_ISREG(inode->i_mode); +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; +#endif + return kmalloc(size, flags); } static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) @@ -1632,14 +1932,14 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) } #define get_inode_mode(i) \ - ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) /* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ - ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ - (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) +#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ + ((pgofs < ADDRS_PER_INODE(inode)) ? 
ADDRS_PER_INODE(inode) : \ + (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) /* * file.c @@ -1647,7 +1947,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1660,9 +1960,10 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +struct inode *f2fs_iget_retry(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1675,29 +1976,34 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, +unsigned char get_de_type(struct f2fs_dir_entry *); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct f2fs_str *); + unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct 
inode *, struct inode *, - const struct qstr *, struct page *); + const struct qstr *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, +void f2fs_drop_nlink(struct inode *, struct inode *); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, + struct page **); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); +int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, + nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -1714,10 +2020,13 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_inode_dirtied(struct inode *); +void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) void f2fs_msg(struct super_block *, const char *, const char *, ...); +int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c @@ -1735,6 +2044,7 @@ 
int need_dentry_mark(struct f2fs_sb_info *, nid_t); bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); @@ -1745,8 +2055,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +void move_node_page(struct page *, int); +int fsync_node_pages(struct f2fs_sb_info *, struct inode *, + struct writeback_control *, bool); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); +void build_free_nids(struct f2fs_sb_info *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1766,8 +2079,9 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void drop_inmem_pages(struct inode *); +int commit_inmem_pages(struct inode *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1777,7 +2091,6 @@ bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); 
void release_discard_addrs(struct f2fs_sb_info *); -bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -1787,16 +2100,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(unsigned int, struct f2fs_io_info *); void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); void rewrite_data_page(struct f2fs_io_info *); +void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, + block_t, block_t, bool, bool); void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool); + block_t, block_t, unsigned char, bool, bool); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_summary_block *, - int, unsigned int, int); +int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); @@ -1806,6 +2120,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); @@ -1813,21 +2128,21 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int 
ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int f2fs_sync_inode_meta(struct f2fs_sb_info *); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void add_orphan_inode(struct inode *); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1836,34 +2151,46 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, + struct page *, nid_t, enum page_type, int); +void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void 
set_data_blkaddr(struct dnode_of_data *); +void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); +ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); +#endif /* * gc.c */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); +block_t start_bidx_of_node(unsigned int, struct inode *); int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1877,18 +2204,20 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + s64 ndirty_node, 
ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + s64 inmem_pages; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; - unsigned int valid_count, valid_node_count, valid_inode_count; + int bg_gc, wb_bios; + int inline_xattr, inline_inode, inline_dir, orphans; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1909,10 +2238,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1987,14 +2317,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define 
stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2015,7 +2346,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2044,16 +2375,15 @@ int f2fs_convert_inline_inode(struct inode *); int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct f2fs_filename *, struct page **); -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); + struct fscrypt_name *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, - nid_t, umode_t); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct f2fs_str *); + struct fscrypt_str *); int f2fs_inline_data_fiemap(struct inode *, struct fiemap_extent_info *, __u64, __u64); @@ -2069,8 +2399,8 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void 
f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +void f2fs_drop_extent_tree(struct inode *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2084,13 +2414,9 @@ void destroy_extent_cache(void); /* * crypto support */ -static inline int f2fs_encrypted_inode(struct inode *inode) +static inline bool f2fs_encrypted_inode(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return file_is_encrypt(inode); -#else - return 0; -#endif } static inline void f2fs_set_encrypted_inode(struct inode *inode) @@ -2102,26 +2428,38 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) static inline bool f2fs_bio_encrypted(struct bio *bio) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + return bio->bi_private != NULL; } static inline int f2fs_sb_has_crypto(struct super_block *sb) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); -#else - return 0; -#endif +} + +static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); +} + +static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } } static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else @@ -2129,86 +2467,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -/* crypto_policy.c */ -int 
f2fs_is_child_context_consistent_with_parent(struct inode *, - struct inode *); -int f2fs_inherit_context(struct inode *, struct inode *, struct page *); -int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); -int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); - -/* crypt.c */ -extern struct kmem_cache *f2fs_crypt_info_cachep; -bool f2fs_valid_contents_enc_mode(uint32_t); -uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); -struct page *f2fs_encrypt(struct inode *, struct page *); -int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); -int f2fs_decrypt_one(struct inode *, struct page *); -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); - -/* crypto_key.c */ -void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); -int _f2fs_get_encryption_info(struct inode *inode); - -/* crypto_fname.c */ -bool f2fs_valid_filenames_enc_mode(uint32_t); -u32 f2fs_fname_crypto_round_up(u32, u32); -int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *); -int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *, - const struct f2fs_str *, struct f2fs_str *); -int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *, - struct f2fs_str *); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION -void f2fs_restore_and_release_control_page(struct page **); -void f2fs_restore_control_page(struct page *); - -int __init f2fs_init_crypto(void); -int f2fs_crypto_initialize(void); -void f2fs_exit_crypto(void); - -int f2fs_has_encryption_key(struct inode *); - -static inline int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return 
_f2fs_get_encryption_info(inode); - return 0; -} - -void f2fs_fname_crypto_free_buffer(struct f2fs_str *); -int f2fs_fname_setup_filename(struct inode *, const struct qstr *, - int lookup, struct f2fs_filename *); -void f2fs_fname_free_filename(struct f2fs_filename *); -#else -static inline void f2fs_restore_and_release_control_page(struct page **p) { } -static inline void f2fs_restore_control_page(struct page *p) { } - -static inline int __init f2fs_init_crypto(void) { return 0; } -static inline void f2fs_exit_crypto(void) { } - -static inline int f2fs_has_encryption_key(struct inode *i) { return 0; } -static inline int f2fs_get_encryption_info(struct inode *i) { return 0; } -static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { } - -static inline int f2fs_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct f2fs_filename *fname) -{ - memset(fname, 0, sizeof(struct f2fs_filename)); - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *)iname->name; - fname->disk_name.len = iname->len; - return 0; -} - -static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { } +#ifndef CONFIG_F2FS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_process_policy fscrypt_notsupp_process_policy +#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define 
fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk #endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h deleted file mode 100644 index c2c1c2b63b25..000000000000 --- a/fs/f2fs/f2fs_crypto.h +++ /dev/null @@ -1,151 +0,0 @@ -/* - * linux/fs/f2fs/f2fs_crypto.h - * - * Copied from linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for f2fs - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. 
- */ -#ifndef _F2FS_CRYPTO_H -#define _F2FS_CRYPTO_H - -#include - -#define F2FS_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct f2fs_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 - -#define F2FS_POLICY_FLAGS_PAD_4 0x00 -#define F2FS_POLICY_FLAGS_PAD_8 0x01 -#define F2FS_POLICY_FLAGS_PAD_16 0x02 -#define F2FS_POLICY_FLAGS_PAD_32 0x03 -#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 -#define F2FS_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct f2fs_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; - char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define F2FS_XTS_TWEAK_SIZE 16 -#define F2FS_AES_128_ECB_KEY_SIZE 16 -#define F2FS_AES_256_GCM_KEY_SIZE 32 -#define F2FS_AES_256_CBC_KEY_SIZE 32 -#define F2FS_AES_256_CTS_KEY_SIZE 32 -#define F2FS_AES_256_XTS_KEY_SIZE 64 -#define F2FS_MAX_KEY_SIZE 64 - -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 - -struct f2fs_encryption_key { - __u32 mode; - char raw[F2FS_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct f2fs_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_ablkcipher *ci_ctfm; - struct key *ci_keyring_key; - char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define 
F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define F2FS_WRITE_PATH_FL 0x00000002 - -struct f2fs_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ -}; - -struct f2fs_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ - struct f2fs_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int f2fs_encryption_key_size(int mode) -{ - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - return F2FS_AES_256_XTS_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_GCM: - return F2FS_AES_256_GCM_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CBC: - return F2FS_AES_256_CBC_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - return F2FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define F2FS_CRYPTO_BLOCK_SIZE 16 -#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct f2fs_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. 
- */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); -} -#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a197215ad52b..c6e33258fabf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -40,8 +42,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +57,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -74,19 +76,20 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { unsigned offset; - offset = i_size_read(inode) & ~PAGE_CACHE_MASK; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -96,6 +99,7 @@ mapped: clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -132,7 +136,7 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 
need_cp = true; - else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; @@ -170,21 +174,16 @@ static void try_to_fix_pino(struct inode *inode) fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { - fi->i_pino = pino; + f2fs_i_pino_write(inode, pino); file_got_pino(inode); - up_write(&fi->i_sem); - - mark_inode_dirty_sync(inode); - f2fs_write_inode(inode, NULL); - } else { - up_write(&fi->i_sem); } + up_write(&fi->i_sem); } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; @@ -201,10 +200,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) - set_inode_flag(fi, FI_NEED_IPU); + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(inode, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -212,7 +211,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync) { + if (!datasync && !f2fs_skip_inode_update(inode)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -220,29 +219,26 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * if there is no written data, don't waste 
time to write recovery info. */ - if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; - if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - down_read(&fi->i_sem); + down_read(&F2FS_I(inode)->i_sem); need_cp = need_do_checkpoint(inode); - up_read(&fi->i_sem); + up_read(&F2FS_I(inode)->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ @@ -253,19 +249,23 @@ go_write: * will be used only for fsynced inodes after checkpoint. */ try_to_fix_pino(inode); - clear_inode_flag(fi, FI_APPEND_WRITE); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); - - /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + ret = fsync_node_pages(sbi, inode, &wbc, atomic); + if (ret) goto out; + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -275,18 +275,24 @@ sync_nodes: goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); + remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); - 
clear_inode_flag(fi, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -300,7 +306,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, pagevec_init(&pvec, 0); nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; + pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -332,7 +338,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -345,32 +351,31 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto found; } - pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = PGOFS_OF_NEXT_DNODE(pgofs, - F2FS_I(inode)); + pgofs = get_next_page_offset(&dn, pgofs); continue; } else { goto found; } } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + 
end_offset = ADDRS_PER_PAGE(dn.node_page, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, - data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); @@ -387,10 +392,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -418,19 +423,20 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode); if (err) return 0; + if (!f2fs_encrypted_inode(inode)) + return -ENOKEY; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -440,12 +446,22 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); + struct dentry *dir; if (!ret && f2fs_encrypted_inode(inode)) { - ret = f2fs_get_encryption_info(inode); + ret = fscrypt_get_encryption_info(inode); if (ret) - ret = -EACCES; + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; } + dir = dget_parent(file_dentry(filp)); + if (f2fs_encrypted_inode(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + dput(dir); + return -EPERM; + } 
+ dput(dir); return ret; } @@ -468,8 +484,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) set_data_blkaddr(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(F2FS_I(dn->inode), - FI_FIRST_BLOCK_WRITTEN); + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; } @@ -480,14 +495,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) * we will invalidate all blkaddr in the whole range. */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), - F2FS_I(dn->inode)) + ofs; + dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); - sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -501,8 +515,8 @@ void truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; @@ -510,7 +524,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = f2fs_grab_cache_page(mapping, index, false); + page = find_lock_page(mapping, index); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); @@ -521,9 +535,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (IS_ERR(page)) return 0; truncate_out: - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, offset, PAGE_CACHE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user(page, offset, 
PAGE_SIZE - offset); + if (!cache_only || !f2fs_encrypted_inode(inode) || + !S_ISREG(inode->i_mode)) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -543,6 +558,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -561,14 +579,14 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; goto out; } - count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = ADDRS_PER_PAGE(dn.node_page, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); @@ -584,7 +602,7 @@ free_next: out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); @@ -593,7 +611,7 @@ out: return err; } -int f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode) { int err; @@ -604,18 +622,18 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - err = truncate_blocks(inode, i_size_read(inode), lock); + err = truncate_blocks(inode, i_size_read(inode), true); if (err) return err; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -631,7 +649,6 @@ int f2fs_getattr(struct vfsmount *mnt, #ifdef CONFIG_F2FS_FS_POSIX_ACL static void __setattr_copy(struct inode *inode, const struct iattr *attr) { - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = 
attr->ia_valid; if (ia_valid & ATTR_UID) @@ -652,7 +669,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else @@ -662,7 +679,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = inode_change_ok(inode, attr); @@ -671,21 +687,28 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { if (f2fs_encrypted_inode(inode) && - f2fs_get_encryption_info(inode)) + fscrypt_get_encryption_info(inode)) return -EACCES; if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. 
*/ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -694,13 +717,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, get_inode_mode(inode)); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return err; } @@ -727,7 +750,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -736,7 +759,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (IS_ERR(page)) return PTR_ERR(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -761,7 +784,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -778,19 +801,17 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - pg_start = ((unsigned long long) 
offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -800,7 +821,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; } @@ -815,10 +836,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); - blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; - blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -831,83 +852,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int __exchange_data_block(struct inode *inode, pgoff_t src, - pgoff_t dst, bool full) +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - block_t new_addr; - bool do_replace = false; - int ret; + int ret, done, i; +next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - if (!is_checkpointed_data(sbi, new_addr)) { - 
dn.data_blkaddr = NULL_ADDR; + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (!is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + /* do not invalidate this block address */ - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - do_replace = true; + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); } f2fs_put_dnode(&dn); } + return 0; +} - if (new_addr == NULL_ADDR) - return full ? 
truncate_hole(inode, dst, dst + 1) : 0; +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; - if (do_replace) { - struct page *ipage = get_node_page(sbi, inode->i_ino); - struct node_info ni; - - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto err_out; + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; } - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, dst); - if (ret) - goto err_out; + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; - truncate_data_blocks_range(&dn, 1); + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, - ni.version, true); - f2fs_put_dnode(&dn); - } else { - struct page *psrc, *pdst; + get_node_info(sbi, dn.nid, &ni); + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.node_page, + dn.ofs_in_node); + truncate_data_blocks_range(&dn, 1); - psrc = get_lock_data_page(inode, src, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = get_new_data_page(inode, NULL, dst, false); - if (IS_ERR(pdst)) { + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false); + f2fs_i_blocks_write(dst_inode, + 1, true); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + + f2fs_put_dnode(&dn); + } 
else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(src_inode, src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); - } - f2fs_copy_page(psrc, pdst); - set_page_dirty(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); - return truncate_hole(inode, src, src + 1); + ret = truncate_hole(src_inode, src + i, src + i + 1); + if (ret) + return ret; + i++; + } + } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } return 0; -err_out: - if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - f2fs_put_dnode(&dn); - } +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } @@ -915,16 +1052,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t 
nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; + int ret; - for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, end, start, true); - f2fs_unlock_op(sbi); - if (ret) - break; - } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); return ret; } @@ -941,16 +1077,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } - - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -972,7 +1104,50 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr 
= + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. + */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -991,13 +1166,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1005,11 +1176,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -1023,48 +1194,40 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; new_size = max_t(loff_t, new_size, - (loff_t)pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + 
unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); - dn.data_blkaddr = NEW_ADDR; - set_data_blkaddr(&dn); - - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - } + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_CACHE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { @@ -1077,11 +1240,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1089,7 +1249,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t pg_start, pg_end, delta, nrpages, idx; + pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1104,13 +1264,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - 
if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1123,17 +1281,23 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; - nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; - for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, idx, idx + delta, false); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); f2fs_unlock_op(sbi); - if (ret) - break; } /* write out all moved pages, if possible */ @@ -1141,7 +1305,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1149,60 +1313,48 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; - - f2fs_balance_fs(sbi); + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) 
& (PAGE_SIZE - 1); + + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; + + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; + + if (!map.m_len) return ret; + + last_off = map.m_lblk + map.m_len - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len: + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; - - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); - - f2fs_lock_op(sbi); - - for (index = pg_start; index <= pg_end; index++) { - struct dnode_of_data dn; - - if (index == pg_end && !off_end) - goto noalloc; - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = f2fs_reserve_block(&dn, index); - if (ret) - break; -noalloc: - if (pg_start == pg_end) - new_size = offset + len; - else if (index == pg_start && off_start) - new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; - else if (index == pg_end) - new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + - off_end; - else - new_size += PAGE_CACHE_SIZE; - } - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } - f2fs_unlock_op(sbi); + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1226,7 +1378,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1245,11 +1397,12 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = 
CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1257,13 +1410,22 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { + /* + * f2fs_relase_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. + */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_DROP_CACHE); } return 0; } @@ -1293,33 +1455,29 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags; unsigned int oldflags; int ret; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + ret = mnt_want_write_file(filp); if (ret) return ret; - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - - if (get_user(flags, (int __user *)arg)) { - ret = -EFAULT; - goto out; - } - flags = f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1328,11 +1486,10 @@ static 
int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); - f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); return ret; @@ -1353,17 +1510,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(F2FS_I_SB(inode)); - - if (f2fs_is_atomic_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (!get_dirty_pages(inode)) + goto out; + + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + clear_inode_flag(inode, FI_ATOMIC_FILE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1374,22 +1549,27 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - ret = mnt_want_write_file(filp); if (ret) return ret; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto err_out; + if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - ret = commit_inmem_pages(inode, false); - if (ret) + clear_inode_flag(inode, FI_ATOMIC_FILE); + ret = commit_inmem_pages(inode); + if (ret) { + 
set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; + } } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1402,31 +1582,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1441,13 +1644,19 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); + inode_lock(inode); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages(inode); + if 
(f2fs_is_volatile_file(inode)) { + clear_inode_flag(inode, FI_VOLATILE_FILE); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + } + + inode_unlock(inode); mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1457,6 +1666,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1464,30 +1674,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; + f2fs_update_time(sbi, REQ_TIME); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1508,15 +1726,21 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), 
REQ_TIME); return 0; } @@ -1532,38 +1756,31 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, - sizeof(policy))) + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, + sizeof(policy))) return -EFAULT; - return f2fs_process_policy(&policy, inode); -#else - return -EOPNOTSUPP; -#endif + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int err; - err = f2fs_get_policy(inode, &policy); + err = fscrypt_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, - sizeof(policy))) + if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; -#else - return -EOPNOTSUPP; -#endif } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1586,13 +1803,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1605,6 +1822,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + 
int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1615,21 +1833,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1637,13 +1864,343 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + ret = mnt_want_write_file(filp); + if (ret) + return ret; - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + ret = f2fs_sync_fs(sbi->sb, 1); - return 0; + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; + + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + /* writeback all dirty pages in the range 
*/ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(inode, FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if 
(idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(inode, FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_DO_DEFRAG); +out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; +} + +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + if (src == dst) { + if (pos_in == pos_out) 
+ return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + + inode_lock(src); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = 
fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1679,6 +2236,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); default: return -ENOTTY; } @@ -1686,14 +2247,36 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct blk_plug plug; + ssize_t ret; if (f2fs_encrypted_inode(inode) && - !f2fs_has_encryption_key(inode) && - f2fs_get_encryption_info(inode)) + !fscrypt_has_encryption_key(inode) && + fscrypt_get_encryption_info(inode)) return -EACCES; - return generic_file_write_iter(iocb, from); + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) { + ret = f2fs_preallocate_blocks(iocb, from); + if (!ret) { + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + } + } + inode_unlock(inode); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; } #ifdef CONFIG_COMPAT @@ -1706,6 +2289,24 @@ long f2fs_compat_ioctl(struct file *file, 
unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; + case F2FS_IOC_MOVE_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0842..0a0a1ad1fe1f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "node.h" @@ -48,6 +47,11 @@ static int gc_thread_func(void *data) continue; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + /* * [GC triggering condition] * 0. GC is not conducted currently. 
@@ -97,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -173,9 +177,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -246,6 +250,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, return get_cb_cost(sbi, segno); } +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. 
@@ -259,9 +275,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno, max_cost; + unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); - int nsearched = 0; + unsigned int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -269,11 +285,12 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.min_cost = get_max_cost(sbi, &p); if (p.max_search == 0) goto out; + last_victim = sbi->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -296,27 +313,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } p.offset = segno + p.ofs_unit; - if (p.ofs_unit > 1) + if (p.ofs_unit > 1) { p.offset -= segno % p.ofs_unit; + nsearched += count_bits(p.dirty_segmap, + p.offset - p.ofs_unit, + p.ofs_unit); + } else { + nsearched++; + } + secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) - continue; + goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) - continue; + goto next; cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } else if (unlikely(cost == max_cost)) { - continue; } - - if (nsearched++ >= p.max_search) { - sbi->last_victim[p.gc_mode] = segno; +next: + if (nsearched >= p.max_search) { + if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) + sbi->last_victim[p.gc_mode] = last_victim + 1; + else + sbi->last_victim[p.gc_mode] = segno + 1; break; } } @@ -400,13 +425,13 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. 
*/ -static int gc_node_segment(struct f2fs_sb_info *sbi, +static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; block_t start_addr; int off; + int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -419,16 +444,24 @@ next_step: struct node_info ni; /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { + if (phase == 0) { + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { ra_node_page(sbi, nid); continue; } + + /* phase == 2 */ node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; @@ -445,36 +478,12 @@ next_step: continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } - if (initial) { - initial = false; + if (++phase < 3) goto next_step; - } - - if (gc_type == FG_GC) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } /* @@ -484,7 +493,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. 
*/ -block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -501,7 +510,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode); } static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -547,6 +556,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) struct f2fs_summary sum; struct node_info ni; struct page *page; + block_t newaddr; int err; /* do not read out */ @@ -568,21 +578,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ fio.page = page; - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), - fio.blk_addr, - FGP_LOCK|FGP_CREAT, - GFP_NOFS); - if (!fio.encrypted_page) - goto put_out; + allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, CURSEG_COLD_DATA); + + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto recover_block; + } err = f2fs_submit_page_bio(&fio); if (err) @@ -591,33 +604,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* write page */ lock_page(fio.encrypted_page); - if (unlikely(!PageUptodate(fio.encrypted_page))) + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + err = -EIO; goto put_page_out; - if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + } + if (unlikely(!PageUptodate(fio.encrypted_page))) { + err = -EIO; goto put_page_out; + } set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, - &fio.blk_addr, &sum, CURSEG_COLD_DATA); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + fio.rw = WRITE_SYNC; + fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); - dn.data_blkaddr = fio.blk_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + 
f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); +recover_block: + if (err) + __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true); put_out: f2fs_put_dnode(&dn); out: @@ -645,12 +664,23 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + bool is_dirty = PageDirty(page); + int err; + +retry: set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); + set_cold_data(page); - do_write_data_page(&fio); + + err = do_write_data_page(&fio); + if (err == -ENOMEM && is_dirty) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + clear_cold_data(page); } out: @@ -664,7 +694,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -684,16 +714,23 @@ next_step: struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); /* stop BG_GC if there is not enough free sections. 
*/ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + ra_node_page(sbi, nid); continue; } @@ -701,14 +738,14 @@ next_step: if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { + if (phase == 2) { ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode)) continue; @@ -720,7 +757,7 @@ next_step: continue; } - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { @@ -733,30 +770,41 @@ next_step: continue; } - /* phase 3 */ + /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->dio_rwsem[READ])) + continue; + if (!down_write_trylock( + &fi->dio_rwsem[WRITE])) { + up_write(&fi->dio_rwsem[READ]); + continue; + } + locked = true; + } + + start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) move_encrypted_block(inode, start_bidx); else move_data_page(inode, start_bidx, gc_type); + + if (locked) { + up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->dio_rwsem[READ]); + } + stat_inc_data_blk_count(sbi, 1, gc_type); } } - if (++phase < 4) + if (++phase < 5) goto next_step; - - if (gc_type == FG_GC) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - - /* return 1 only if FG_GC 
succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -772,51 +820,84 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; - int nfree = 0; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + sbi->segs_per_sec; + int sec_freed = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + SUM_TYPE_DATA : SUM_TYPE_NODE; - /* read segment summary of victim */ - sum_page = get_sum_page(sbi, segno); + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + sbi->segs_per_sec, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + sum_page = get_sum_page(sbi, segno++); + unlock_page(sum_page); + } blk_start_plug(&plug); - sum = page_address(sum_page); + for (segno = start_segno; segno < end_segno; segno++) { - /* - * this is to avoid deadlock: - * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() - * - lock_page(sum_page) - */ - unlock_page(sum_page); + if (get_valid_blocks(sbi, segno, 1) == 0 || + unlikely(f2fs_cp_error(sbi))) + goto next; - switch (GET_SUM_TYPE((&sum->footer))) { - case SUM_TYPE_NODE: - nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); - break; - case SUM_TYPE_DATA: - nfree = gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); - break; + /* find segment summary of victim */ + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + 
f2fs_bug_on(sbi, !PageUptodate(sum_page)); + f2fs_put_page(sum_page, 0); + + sum = page_address(sum_page); + f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - mutex_lock(sentry_lock) + * - mutex_lock(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + + if (type == SUM_TYPE_NODE) + gc_node_segment(sbi, sum->entries, segno, gc_type); + else + gc_data_segment(sbi, sum->entries, gc_list, segno, + gc_type); + + stat_inc_seg_count(sbi, type, gc_type); +next: + f2fs_put_page(sum_page, 0); } + + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); + if (gc_type == FG_GC && + get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + sec_freed = 1; + stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); - return nfree; + return sec_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { - unsigned int segno, i; + unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; @@ -832,46 +913,48 @@ gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { gc_type = FG_GC; - if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) - write_checkpoint(sbi, &cpc); + /* + * If there is no victim and no prefree segment but still not + * enough free sections, we should flush dent/node blocks and do + * garbage collections. 
+ */ + if (__get_victim(sbi, &segno, gc_type) || + prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + segno = NULL_SEGNO; + } else if (has_not_enough_free_secs(sbi, 0, 0)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; - /* readahead multi ssa blocks those have contiguous address */ - if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA, true); - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * for FG_GC case, halt gcing left segments once failed one - * of segments in selected section to avoid long latency. - */ - if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && - gc_type == FG_GC) - break; - } - - if (i == sbi->segs_per_sec && gc_type == FG_GC) + if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && + gc_type == FG_GC) sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); } stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda7126466c0..a04c1016d511 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -16,9 +16,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if 
(!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -54,7 +51,7 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); @@ -62,7 +59,8 @@ void read_inline_data(struct page *page, struct page *ipage) memcpy(dst_addr, src_addr, MAX_INLINE_DATA); flush_dcache_page(page); kunmap_atomic(dst_addr); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } bool truncate_inline_inode(struct page *ipage, u64 from) @@ -74,9 +72,9 @@ bool truncate_inline_inode(struct page *ipage, u64 from) addr = inline_data_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); - + set_page_dirty(ipage); return true; } @@ -96,11 +94,12 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } if (page->index) - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); else read_inline_data(page, ipage); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); f2fs_put_page(ipage, 1); unlock_page(page); return 0; @@ -108,7 +107,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { - void *src_addr, *dst_addr; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, @@ -118,8 +116,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) }; int dirty, err; - f2fs_bug_on(F2FS_I_SB(dn->inode), page->index); - if (!f2fs_exist_data(dn->inode)) goto clear_out; @@ -127,21 +123,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - 
f2fs_wait_on_page_writeback(page, DATA); + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - if (PageUptodate(page)) - goto no_update; - - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); - - /* Copy the whole inline data block */ - src_addr = inline_data_addr(dn->inode_page); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - flush_dcache_page(page); - kunmap_atomic(dst_addr); - SetPageUptodate(page); -no_update: + read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -149,23 +133,21 @@ no_update: /* write data page to try to make data consistent */ set_page_writeback(page); - fio.blk_addr = dn->data_blkaddr; + fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) inode_dec_dirty_pages(dn->inode); /* this converted inline_data should be recovered. 
*/ - set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ truncate_inline_inode(dn->inode_page, 0); + clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); - sync_inode_page(dn); f2fs_put_dnode(dn); return 0; } @@ -177,7 +159,10 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - page = grab_cache_page(inode->i_mapping, 0); + if (!f2fs_has_inline_data(inode)) + return 0; + + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -199,6 +184,9 @@ out: f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } @@ -220,16 +208,17 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); - f2fs_wait_on_page_writeback(dn.inode_page, NODE); + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); dst_addr = inline_data_addr(dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); - sync_inode_page(&dn); + clear_inline_node(dn.inode_page); f2fs_put_dnode(&dn); return 0; } @@ -258,16 +247,16 @@ process_inline: ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); src_addr = inline_data_addr(npage); dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); - 
update_inode(inode, ipage); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return true; } @@ -278,7 +267,6 @@ process_inline: if (!truncate_inline_inode(ipage, 0)) return false; f2fs_clear_inline_inode(inode); - update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -289,7 +277,7 @@ process_inline: } struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, - struct f2fs_filename *fname, struct page **res_page) + struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_inline_dentry *inline_dentry; @@ -300,8 +288,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } namehash = f2fs_dentry_hash(&name); @@ -315,30 +305,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. 
- */ - f2fs_bug_on(sbi, d.max < 0); - return de; -} - -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, - struct page **p) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; - struct f2fs_dir_entry *de; - struct f2fs_inline_dentry *dentry_blk; - - ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return NULL; - - dentry_blk = inline_data_addr(ipage); - de = &dentry_blk->dentry[1]; - *p = ipage; - unlock_page(ipage); return de; } @@ -356,10 +322,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) { - i_size_write(inode, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + if (i_size_read(inode) < MAX_INLINE_DATA) + f2fs_i_size_write(inode, MAX_INLINE_DATA); return 0; } @@ -367,7 +331,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. 
*/ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -375,7 +339,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -386,8 +350,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, if (err) goto out; - f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); dentry_blk = kmap_atomic(page); @@ -408,37 +372,132 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, NR_INLINE_DENTRY * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); set_page_dirty(page); /* clear inline dir and flag after data writeback */ truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + clear_inode_flag(dir, FI_INLINE_DENTRY); - if (i_size_read(dir) < PAGE_CACHE_SIZE) { - i_size_write(dir, PAGE_CACHE_SIZE); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - - sync_inode_page(&dn); + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); out: f2fs_put_page(page, 1); return err; } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode, nid_t ino, umode_t mode) +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while 
(bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + +int 
f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); + int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; int err = 0; @@ -459,25 +518,27 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); + page = init_inode_metadata(inode, dir, new_name, + orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(name); + name_hash = f2fs_dentry_hash(new_name); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); - f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); /* we don't need to mark_inode_dirty now */ if (inode) { - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -485,11 +546,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode(dir, ipage); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } out: f2fs_put_page(ipage, 1); return err; @@ -504,22 +560,22 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); inline_dentry = 
inline_data_addr(page); bit_pos = dentry - inline_dentry->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, + __clear_bit_le(bit_pos + i, &inline_dentry->dentry_bitmap); set_page_dirty(page); + f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, page); - - f2fs_put_page(page, 1); + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_inline_dir(struct inode *dir) @@ -547,7 +603,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) } int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, - struct f2fs_str *fstr) + struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); struct f2fs_inline_dentry *inline_dentry = NULL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 97e20decacb4..d7369895a78a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "f2fs.h" @@ -18,6 +19,13 @@ #include +void f2fs_mark_inode_dirty_sync(struct inode *inode) +{ + if (f2fs_inode_dirtied(inode)) + return; + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -35,6 +43,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -83,10 +92,10 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); set_page_dirty(ipage); return; } @@ -138,9 +147,10 @@ static int 
do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); - get_inline_info(fi, ri); + get_inline_info(inode, ri); /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) @@ -150,7 +160,10 @@ static int do_read_inode(struct inode *inode) __get_inode_rdev(inode, ri); if (__written_first_block(ri)) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; f2fs_put_page(node_page, 1); @@ -202,6 +215,7 @@ make_now: inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -221,11 +235,27 @@ bad_inode: return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE); + f2fs_inode_synced(inode); + + f2fs_wait_on_page_writeback(node_page, NODE, true); ri = F2FS_INODE(node_page); @@ -242,7 +272,7 @@ void update_inode(struct inode *inode, struct page *node_page) &ri->i_ext); else memset(&ri->i_ext, 0, sizeof(ri->i_ext)); - set_raw_inline(F2FS_I(inode), ri); + set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = 
cpu_to_le64(inode->i_ctime.tv_sec); @@ -259,15 +289,19 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + /* deleted inode */ + if (inode->i_nlink == 0) + clear_inline_node(node_page); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -276,12 +310,14 @@ retry: cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } - return; + f2fs_inode_synced(inode); + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -292,16 +328,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; /* * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -311,13 +346,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - nid_t xnid = fi->i_xattr_nid; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -327,19 +361,24 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - sb_start_intwrite(inode->i_sb); - set_inode_flag(fi, FI_NO_ALLOC); - i_size_write(inode, 0); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (!err) { f2fs_lock_op(sbi); @@ -347,6 +386,14 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) + update_inode_page(inode); sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -356,36 +403,18 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - 
add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); - if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); - clear_inode_flag(fi, FI_FREE_NID); - } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(inode, FI_FREE_NID)) { + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (fi->i_crypt_info) - f2fs_free_encryption_info(inode, fi->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); } @@ -393,37 +422,32 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. valid block count, node block or nid). Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. 
- * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) - add_orphan_inode(sbi, inode->i_ino); + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { + add_orphan_inode(inode); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2c32110f9fc0..0f071a70522d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(inode, FI_NEW_INODE); + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + set_inode_flag(inode, FI_INLINE_DENTRY); f2fs_init_extent_tree(inode, NULL); @@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_dir(inode); trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); return inode; fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); if (nid_free) - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); } @@ -88,18 +91,23 @@ static int 
is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); + int i; /* * filename format of multimedia file should be defined as: - * "filename + '.' + extension". + * "filename + '.' + extension + (optional: '.' + temp extension)". */ if (slen < sublen + 2) return 0; - if (s[slen - sublen - 1] != '.') - return 0; + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } - return !strncasecmp(s + slen - sublen, sub, sublen); + return 0; } /* @@ -128,8 +136,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +148,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -169,15 +177,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, int err; if (f2fs_encrypted_inode(dir) && - !f2fs_is_child_context_consistent_with_parent(dir, inode)) + !fscrypt_has_permitted_context(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -190,7 +198,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); f2fs_unlock_op(sbi); return err; @@ -199,10 +207,14 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = 
f2fs_inode_by_name(d_inode(child), &dotdot); - if (!ino) + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } static int __recover_dot_dentries(struct inode *dir, nid_t pino) @@ -214,12 +226,24 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) @@ -230,14 +254,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); } out: - if (!err) { - clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); - mark_inode_dirty(dir); - } + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); f2fs_unlock_op(sbi); return err; @@ -251,13 +275,32 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct page *page; nid_t ino; int err = 0; + unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + + if (f2fs_encrypted_inode(dir)) { + int res = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was encrypted and we + * have access to the key. 
+ */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + return (struct dentry *)page; return d_splice_alias(inode, dentry); + } ino = le32_to_cpu(de->ino); f2fs_dentry_kunmap(dir, page); @@ -267,15 +310,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return ERR_CAST(inode); + if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { + err = __recover_dot_dentries(dir, root_ino); + if (err) + goto err_out; + } + if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) goto err_out; } + if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + bool nokey = f2fs_encrypted_inode(inode) && + !fscrypt_has_encryption_key(inode); + err = nokey ? 
-ENOKEY : -EPERM; + goto err_out; + } return d_splice_alias(inode, dentry); err_out: - iget_failed(inode); + iput(inode); return ERR_PTR(err); } @@ -288,11 +345,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); goto fail; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -305,9 +366,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -332,16 +390,24 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - size_t p_len; - char *p_str; - struct f2fs_str disk_link = FSTR_INIT(NULL, 0); - struct f2fs_encrypted_symlink_data *sd = NULL; + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; int err; - if (len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (f2fs_encrypted_inode(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + return err; - f2fs_balance_fs(sbi); + if (!fscrypt_has_encryption_key(dir)) + return -EPERM; + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } + + if (disk_link.len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) @@ -351,8 +417,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = 
&f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -360,42 +429,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(inode)) { struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; - err = f2fs_get_encryption_info(inode); - if (err) - goto err_out; - - err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); - if (err) - goto err_out; - - err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); - if (err < 0) - goto err_out; - - p_len = encrypted_symlink_data_len(disk_link.len) + 1; - - if (p_len > dir->i_sb->s_blocksize) { - err = -ENAMETOOLONG; - goto err_out; - } - - sd = kzalloc(p_len, GFP_NOFS); + sd = kzalloc(disk_link.len, GFP_NOFS); if (!sd) { err = -ENOMEM; goto err_out; } - memcpy(sd->encrypted_path, disk_link.name, disk_link.len); - sd->len = cpu_to_le16(disk_link.len); - p_str = (char *)sd; - } else { - p_len = len + 1; - p_str = (char *)symname; + + err = fscrypt_get_encryption_info(inode); + if (err) + goto err_out; + + if (!fscrypt_has_encryption_key(inode)) { + err = -EPERM; + goto err_out; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err < 0) + goto err_out; + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; } - err = page_symlink(inode, p_str, p_len); + err = page_symlink(inode, disk_link.name, disk_link.len); err_out: d_instantiate(dentry, inode); @@ -411,7 +474,8 @@ err_out: * performance regression. 
*/ if (!err) { - filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -420,7 +484,6 @@ err_out: } kfree(sd); - f2fs_fname_crypto_free_buffer(&disk_link); return err; out: handle_failed_inode(inode); @@ -433,8 +496,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -444,7 +505,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_balance_fs(sbi, true); + + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -461,7 +524,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); handle_failed_inode(inode); return err; } @@ -481,8 +544,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -490,6 +551,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -516,9 +579,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -532,6 +592,8 @@ static int __f2fs_tmpfile(struct inode 
*dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -545,17 +607,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - + add_orphan_inode(inode); alloc_nid_done(sbi, inode->i_ino); if (whiteout) { - inode_dec_link_count(inode); + f2fs_i_links_write(inode, false); *whiteout = inode; } else { d_tmpfile(dentry, inode); } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); unlock_new_inode(inode); return 0; @@ -569,7 +631,7 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { if (f2fs_encrypted_inode(dir)) { - int err = f2fs_get_encryption_info(dir); + int err = fscrypt_get_encryption_info(dir); if (err) return err; } @@ -595,26 +657,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode)) { + !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_old; + } } if (flags & RENAME_WHITEOUT) { @@ -632,8 +697,13 
@@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_whiteout; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -641,8 +711,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - if (update_dent_inode(old_inode, new_inode, - &new_dentry->d_name)) { + err = update_dent_inode(old_inode, new_inode, + &new_dentry->d_name); + if (err) { release_orphan_inode(sbi); goto put_out_dir; } @@ -652,20 +723,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); up_write(&F2FS_I(new_inode)->i_sem); - mark_inode_dirty(new_inode); - if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); + add_orphan_inode(new_inode); else release_orphan_inode(sbi); - - update_inode_page(old_inode); - update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -674,9 +742,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_whiteout; } - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); + + /* + * old entry and new entry can locate in the same inline + * dentry in inode, when attaching new entry in inline dentry, + * it could force inline dentry conversion, after that, + * old_entry and old_page will point to wrong address, in + * order to avoid this, let's do the check and update here. 
+ */ + if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { + f2fs_put_page(old_page, 0); + old_page = NULL; + + old_entry = f2fs_find_entry(old_dir, + &old_dentry->d_name, &old_page); + if (!old_entry) { + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + f2fs_unlock_op(sbi); + goto out_whiteout; + } } } @@ -687,13 +775,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (whiteout) { whiteout->i_state |= I_LINKABLE; - set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; @@ -705,14 +793,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - update_inode_page(old_inode); } else { f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } - drop_nlink(old_dir); - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_i_links_write(old_dir, false); } f2fs_unlock_op(sbi); @@ -756,39 +841,45 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int err = -ENOENT; if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode) || - !f2fs_is_child_context_consistent_with_parent(old_dir, - new_inode))) + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, old_inode) || + !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto 
out; + } new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_old; + } /* prepare for updating ".." directory entry info later */ if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_new; + } } if (S_ISDIR(new_inode->i_mode)) { - err = -EIO; new_dir_entry = f2fs_parent_dir(new_inode, &new_dir_page); - if (!new_dir_entry) + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); goto out_old_dir; + } } } @@ -807,6 +898,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -836,19 +929,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - update_inode_page(old_inode); - old_dir->i_ctime = CURRENT_TIME; if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); - if (old_nlink < 0) - drop_nlink(old_dir); - else - inc_nlink(old_dir); + f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_mark_inode_dirty_sync(old_dir); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -857,19 +944,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - update_inode_page(new_inode); - new_dir->i_ctime = CURRENT_TIME; if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); - if (new_nlink < 0) - drop_nlink(new_dir); - else - inc_nlink(new_dir); + 
f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - mark_inode_dirty(new_dir); - update_inode_page(new_dir); + f2fs_mark_inode_dirty_sync(new_dir); f2fs_unlock_op(sbi); @@ -922,89 +1003,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -#ifdef CONFIG_F2FS_FS_ENCRYPTION static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; - struct f2fs_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_str cstr = FSTR_INIT(NULL, 0); + struct fscrypt_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_symlink_data *sd; struct inode *inode = d_inode(dentry); - struct f2fs_encrypted_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; - res = f2fs_get_encryption_info(inode); + if (!dentry) + return ERR_PTR(-ECHILD); + + res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); - caddr[size] = 0; + caddr = page_address(cpage); /* Symlink is encrypted */ - sd = (struct f2fs_encrypted_symlink_data *)caddr; + sd = (struct fscrypt_symlink_data *)caddr; + cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); - cstr.name = kmalloc(cstr.len, GFP_NOFS); - if (!cstr.name) { - res = -ENOMEM; - goto errout; - } - memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.len == 0)) { res = -ENOENT; goto errout; } - if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) > - max_size) { + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EIO; goto errout; } - res = 
f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); + res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; - res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - kfree(cstr.name); + /* this is broken symlink case */ + if (unlikely(pstr.name[0] == 0)) { + res = -ENOENT; + goto errout; + } paddr = pstr.name; /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); - page_cache_release(cpage); + put_page(cpage); return *cookie = paddr; errout: - kfree(cstr.name); - f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); - page_cache_release(cpage); + fscrypt_fname_free_buffer(&pstr); + put_page(cpage); return ERR_PTR(res); } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, - .put_link = kfree_put_link, + .follow_link = f2fs_encrypted_follow_link, + .put_link = kfree_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, -}; #endif +}; const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c40d..b1e615ed2bef 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,12 +46,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; } else if (type == DIRTY_DENTS) { if 
(sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -62,16 +64,17 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) for (i = 0; i <= UPDATE_INO; i++) mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + sizeof(struct ino_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * - sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -120,7 +123,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -256,18 +259,21 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) return new; } -static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, +static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); + } else { + f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || + nat_get_blkaddr(e) != ne->block_addr || + nat_get_version(e) != ne->version); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -355,7 +361,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct 
node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = START_NID(nid); struct f2fs_nat_block *nat_blk; struct page *page = NULL; @@ -372,21 +378,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - } - up_read(&nm_i->nat_tree_lock); - if (e) + up_read(&nm_i->nat_tree_lock); return; + } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); /* Check current segment summary */ - mutex_lock(&curseg->curseg_mutex); - i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); + down_read(&curseg->journal_rwsem); + i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { - ne = nat_in_journal(sum, i); + ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); if (i >= 0) goto cache; @@ -397,18 +402,75 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - cache_nat_entry(NM_I(sbi), nid, &ne); + down_write(&nm_i->nat_tree_lock); + cache_nat_entry(sbi, nid, &ne); + up_write(&nm_i->nat_tree_lock); +} + +/* + * readahead MAX_RA_NODE number of node pages. 
+ */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK; + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + case 2: + base += 2 * direct_blks; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; } /* * The maximum depth is four. * Offset[0] will have raw inode offset. 
*/ -static int get_node_path(struct f2fs_inode_info *fi, long block, +static int get_node_path(struct inode *inode, long block, int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE(fi); + const long direct_index = ADDRS_PER_INODE(inode); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -493,10 +555,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int offset[4]; unsigned int noffset[4]; nid_t nids[4]; - int level, i; + int level, i = 0; int err = 0; - level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); + level = get_node_path(dn->inode, index, offset, noffset); nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -583,6 +645,11 @@ release_pages: release_out: dn->inode_page = NULL; dn->node_page = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } return err; } @@ -606,8 +673,7 @@ static void truncate_node(struct dnode_of_data *dn) if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } invalidate: clear_node_page_dirty(dn->node_page); @@ -666,6 +732,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -676,7 +744,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +758,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, 
child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -741,6 +811,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -750,7 +822,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -787,8 +860,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(F2FS_I(inode), from, offset, noffset); -restart: + level = get_node_path(inode, from, offset, noffset); + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -852,11 +925,8 @@ skip_partial: if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } - f2fs_wait_on_page_writeback(page, NODE); + BUG_ON(page->mapping != NODE_MAPPING(sbi)); + f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -885,7 +955,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) if (IS_ERR(npage)) return PTR_ERR(npage); - F2FS_I(inode)->i_xattr_nid = 0; + f2fs_i_xnid_write(inode, 0); /* need to do checkpoint during fsync */ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -951,10 +1021,10 @@ struct page 
*new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -971,23 +1041,19 @@ struct page *new_node_page(struct dnode_of_data *dn, new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); - SetPageUptodate(page); - set_page_dirty(page); + if (!PageUptodate(page)) + SetPageUptodate(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) - F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); if (ofs == 0) inc_valid_inode_count(sbi); - return page; fail: @@ -1013,6 +1079,9 @@ static int read_node_page(struct page *page, int rw) .encrypted_page = NULL, }; + if (PageUptodate(page)) + return LOCKED_PAGE; + get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1020,10 +1089,7 @@ static int read_node_page(struct page *page, int rw) return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; - - fio.blk_addr = ni.blk_addr; + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; return f2fs_submit_page_bio(&fio); } @@ -1035,14 +1101,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; - apage = find_get_page(NODE_MAPPING(sbi), nid); - if (apage && PageUptodate(apage)) { - f2fs_put_page(apage, 0); + if (!nid) return; - } - f2fs_put_page(apage, 0); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); - apage = 
grab_cache_page(NODE_MAPPING(sbi), nid); + rcu_read_lock(); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + rcu_read_unlock(); + if (apage) + return; + + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; @@ -1050,53 +1119,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { struct page *page; int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } - - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; - } - return page; -} - -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - struct page *page; - int err, i, end; - nid_t nid; - - /* First, try getting the desired direct node. 
*/ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1108,61 +1141,116 @@ repeat: goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } + + if (unlikely(!PageUptodate(page))) + goto out_err; page_hit: - if (unlikely(!PageUptodate(page))) { + if(unlikely(nid != nid_of_node(page))) { + f2fs_bug_on(sbi, 1); + ClearPageUptodate(page); +out_err: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } return page; } -void sync_inode_page(struct dnode_of_data *dn) +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); - } else { - update_inode_page(dn->inode); - } + return __get_node_page(sbi, nid, NULL, 0); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + +static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + 
struct page *page; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + if (!page) + goto iput_out; + + if (!PageUptodate(page)) + goto page_out; + + if (!PageDirty(page)) + goto page_out; + + if (!clear_page_dirty_for_io(page)) + goto page_out; + + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + if (ret) + set_page_dirty(page); +page_out: + f2fs_put_page(page, 1); +iput_out: + iput(inode); +} + +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 
2 : 0; - int nwritten = 0, wrote = 0; + struct page *last_page = NULL; pagevec_init(&pvec, 0); - -next_step: index = 0; - end = LONG_MAX; + end = ULONG_MAX; while (index <= end) { int i, nr_pages; @@ -1175,6 +1263,190 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic) +{ + pgoff_t index, end; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nwritten = 0; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + 
lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + update_inode(inode, page); + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } else { + nwritten++; + } + + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } + + if (nwritten) + f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + return ret ? 
-EIO: 0; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = 0; + int nwritten = 0; + int ret = 0; + + pagevec_init(&pvec, 0); + +next_step: + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + ret = -EIO; + goto out; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1189,14 +1461,8 @@ next_step: if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) +lock_node: + if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1204,33 +1470,33 @@ continue_unlock: unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } + /* flush inline_data */ + if (is_inline_node(page)) { + clear_inline_node(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + goto lock_node; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); else - wrote++; + 
nwritten++; if (--wbc->nr_to_write == 0) break; @@ -1248,15 +1514,15 @@ continue_unlock: step++; goto next_step; } - - if (wrote) +out: + if (nwritten) f2fs_submit_merged_bio(sbi, NODE, WRITE); - return nwritten; + return ret; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; int ret2 = 0, ret = 0; @@ -1278,7 +1544,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) ret = -EIO; } @@ -1317,8 +1583,6 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, NODE); - /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); @@ -1342,14 +1606,18 @@ static int f2fs_write_node_page(struct page *page, } set_page_writeback(page); - fio.blk_addr = ni.blk_addr; + fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1363,10 +1631,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff; - trace_f2fs_writepages(mapping->host, wbc, NODE); - /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1374,14 +1641,19 @@ static int f2fs_write_node_pages(struct address_space *mapping, if 
(get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + blk_start_plug(&plug); + sync_node_pages(sbi, wbc); + blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); return 0; } @@ -1389,9 +1661,10 @@ static int f2fs_set_node_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, NODE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1409,6 +1682,9 @@ const struct address_space_operations f2fs_node_aops = { .set_page_dirty = f2fs_set_node_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1429,7 +1705,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1440,14 +1715,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - allocated = true; - up_read(&nm_i->nat_tree_lock); - if (allocated) return 0; } @@ -1516,22 +1786,24 @@ static void scan_nat_page(struct 
f2fs_sb_info *sbi, } } -static void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1550,16 +1822,19 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { - block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); - nid = le32_to_cpu(nid_in_journal(sum, i)); + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1575,6 +1850,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; @@ -1582,8 +1861,6 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && 
!on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1871,6 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1663,12 +1933,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; + if (nm_i->fcnt <= MAX_FREE_NIDS) + return 0; + if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) break; if (i->state == NID_ALLOC) continue; @@ -1695,7 +1968,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (!(ri->i_inline & F2FS_INLINE_XATTR)) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -1703,7 +1976,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(page); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: update_inode(inode, ipage); @@ -1737,13 +2010,11 @@ recover_xnid: get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - F2FS_I(inode)->i_xattr_nid = new_xnid; + f2fs_i_xnid_write(inode, new_xnid); /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); set_node_addr(sbi, &ni, blkaddr, false); - - update_inode_page(inode); } int recover_inode_page(struct 
f2fs_sb_info *sbi, struct page *page) @@ -1757,15 +2028,18 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); - if (!ipage) - return -ENOMEM; +retry: + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - SetPageUptodate(ipage); + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); src = F2FS_INODE(page); @@ -1831,28 +2105,26 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; - nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); - raw_ne = nat_in_journal(sum, i); + raw_ne = nat_in_journal(journal, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } - update_nats_in_cursum(sum, -i); - mutex_unlock(&curseg->curseg_mutex); + update_nats_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } static void __adjust_nat_entry_set(struct nat_entry_set *nes, @@ -1877,24 +2149,23 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = 
curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { - mutex_lock(&curseg->curseg_mutex); + down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); @@ -1911,35 +2182,29 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, continue; if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); - raw_ne = &nat_in_journal(sum, offset); - nid_in_journal(sum, offset) = cpu_to_le32(nid); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); } else { raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } if (to_journal) - mutex_unlock(&curseg->curseg_mutex); + up_write(&curseg->journal_rwsem); else f2fs_put_page(page, 1); f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1950,7 +2215,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - 
struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct nat_entry_set *setvec[SETVEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; @@ -1959,29 +2224,32 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them * into nat entry set. */ - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, - MAX_NAT_JENTRIES(sum)); + MAX_NAT_JENTRIES(journal)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2006,6 +2274,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d98c4..868bec65e51c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,15 +15,21 @@ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ -#define FREE_NID_PAGES 4 +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ +#define 
DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ -#define DEF_RAM_THRESHOLD 10 +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -117,6 +123,17 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, raw_ne->version = ni->version; } +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ @@ -183,7 +200,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -212,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) f2fs_change_bit(block_off, nm_i->nat_bitmap); } +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node 
*rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { @@ -242,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); - rn->footer.cp_ver = ckpt->checkpoint_ver; + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline nid_t ino_of_node(struct page *node_page) +static inline bool is_recoverable_dnode(struct page *page) { - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.ino); -} + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = cur_cp_version(ckpt); -static inline nid_t nid_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.nid); -} - -static inline unsigned int ofs_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; -} - -static inline unsigned long long cpver_of_node(struct page *node_page) -{ - struct f2fs_node *rn = 
F2FS_NODE(node_page); - return le64_to_cpu(rn->footer.cp_ver); -} - -static inline block_t next_blkaddr_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.next_blkaddr); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + return cpu_to_le64(cp_ver) == cpver_of_node(page); } /* @@ -317,17 +355,17 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - f2fs_wait_on_page_writeback(p, NODE); + f2fs_wait_on_page_writeback(p, NODE, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) @@ -370,6 +408,21 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +static inline int is_inline_node(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_inline_node(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_inline_node(struct page *page) +{ + ClearPageChecked(page); +} + static inline void set_cold_node(struct inode *inode, struct page *page) { struct f2fs_node *rn = F2FS_NODE(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cbf74f47cce8..2fc84a991325 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = 
percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } @@ -67,42 +68,71 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct inode *inode, struct page *ipage) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino) +{ + struct inode *inode; + struct fsync_inode_entry *entry; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; + struct fscrypt_name fname; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; + char *name; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); - return 0; - } + dir = entry->inode; - name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; - if (unlikely(name.len > F2FS_NAME_LEN)) { + if (unlikely(fname.disk_name.len > 
F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: - de = f2fs_find_entry(dir, &name, &page); + de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); @@ -118,29 +148,27 @@ retry: f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; - } - err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + err = __f2fs_do_add_link(dir, &fname, inode, + inode->i_ino, inode->i_mode); } - + if (err == -ENOMEM) + goto retry; goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: + if (file_enc_name(inode)) + name = ""; + else + name = raw_inode->i_name; f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), raw_inode->i_name, + __func__, ino_of_node(ipage), name, IS_ERR(dir) ? 
0 : dir->i_ino, err); return err; } @@ -151,7 +179,7 @@ static void recover_inode(struct inode *inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); - i_size_write(inode, le64_to_cpu(raw->i_size)); + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); @@ -168,9 +196,34 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; @@ -180,8 +233,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; @@ -190,49 +241,41 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(page)) break; if (!is_fsync_dnode(page)) goto next; entry = 
get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + entry = add_fsync_inode(sbi, head, ino_of_node(page)); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -248,11 +291,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -314,15 +354,14 @@ got_it: if (ino != dn->inode->i_ino) { /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); } else { inode = dn->inode; } - bidx = start_bidx_of_node(offset, F2FS_I(inode)) + - le16_to_cpu(sum.ofs_in_node); + bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its 
reference @@ -357,10 +396,9 @@ truncate_out: static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { - struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int start, end; struct dnode_of_data dn; struct node_info ni; + unsigned int start, end; int err = 0, recovered = 0; /* step 1: recover xattr */ @@ -380,16 +418,21 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; /* step 3: recover data indices */ - start = start_bidx_of_node(ofs_of_node(page), fi); - end = start + ADDRS_PER_PAGE(page, fi); + start = start_bidx_of_node(ofs_of_node(page), inode); + end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); - +retry_dn: err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } goto out; + } - f2fs_wait_on_page_writeback(dn.node_page, NODE); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); @@ -411,14 +454,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } + if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. 
*/ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } @@ -427,25 +472,33 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); + if (err) + goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } goto err; + } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, - ni.version, false); + ni.version, false, false); recovered++; } } - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - copy_node_footer(dn.node_page, page); fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); @@ -459,17 +512,16 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; int err = 0; block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -482,12 +534,12 @@ static int recover_data(struct f2fs_sb_info *sbi, page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) { + if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto next; /* @@ -495,10 +547,10 @@ static int recover_data(struct 
f2fs_sb_info *sbi, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (entry->last_inode == blkaddr) + if (IS_INODE(page)) recover_inode(entry->inode, page); if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, page); + err = recover_dentry(entry->inode, page, dir_list); if (err) { f2fs_put_page(page, 1); break; @@ -510,11 +562,8 @@ static int recover_data(struct f2fs_sb_info *sbi, break; } - if (entry->blkaddr == blkaddr) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + if (entry->blkaddr == blkaddr) + del_fsync_inode(entry); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -525,12 +574,14 @@ next: return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + struct list_head dir_list; block_t blkaddr; int err; + int ret = 0; bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -539,6 +590,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); @@ -547,25 +599,26 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); - if (err) + if (err || list_empty(&inode_list)) goto out; - if (list_empty(&inode_list)) + if (check_only) { + ret = 1; goto out; + } need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); - kmem_cache_destroy(fsync_entry_slab); /* truncate meta pages to be used by the recovery */ 
truncate_inode_pages_range(META_MAPPING(sbi), - (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); @@ -573,31 +626,20 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) { - bool invalidate = false; + if (err) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); - if (discard_next_dnode(sbi, blkaddr)) - invalidate = true; + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list); - /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) - sync_meta_pages(sbi, META, LONG_MAX); - - /* invalidate temporary meta page */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), - blkaddr, blkaddr); - - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - mutex_unlock(&sbi->cp_mutex); - } else if (need_writecp) { + if (!err && need_writecp) { struct cp_control cpc = { .reason = CP_RECOVERY, }; - mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, &cpc); - } else { - mutex_unlock(&sbi->cp_mutex); + err = write_checkpoint(sbi, &cpc); } - return err; + + kmem_cache_destroy(fsync_entry_slab); + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f77b3258454a..b3c61ae37f92 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. 
* Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; + while (1) { + if (*p == 0) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. 
*/ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); + while (1) { + if (*p == ~0UL) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. 
*/ -found_middle: - return result + __reverse_ffz(tmp); + return result; +found: + return result - size + __reverse_ffz(tmp); } void register_inmem_page(struct inode *inode, struct page *page) @@ -211,69 +191,149 @@ void register_inmem_page(struct inode *inode, struct page *page) trace_f2fs_register_inmem_page(page, INMEM); } -int commit_inmem_pages(struct inode *inode, bool abort) +static int __revoke_inmem_pages(struct inode *inode, + struct list_head *head, bool drop, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inmem_pages *cur, *tmp; + int err = 0; + + list_for_each_entry_safe(cur, tmp, head, list) { + struct page *page = cur->page; + + if (drop) + trace_f2fs_commit_inmem_page(page, INMEM_DROP); + + lock_page(page); + + if (recover) { + struct dnode_of_data dn; + struct node_info ni; + + trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = -EAGAIN; + goto next; + } + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + cur->old_addr, ni.version, true, true); + f2fs_put_dnode(&dn); + } +next: + /* we don't need to invalidate this in the successful status */ + if (drop || recover) + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 1); + + list_del(&cur->list); + kmem_cache_free(inmem_entry_slab, cur); + dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + } + return err; +} + +void drop_inmem_pages(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + mutex_unlock(&fi->inmem_lock); +} + +static int __commit_inmem_pages(struct inode *inode, + struct list_head *revoke_list) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *cur, *tmp;
- bool submit_bio = false; struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; + bool submit_bio = false; int err = 0; - /* - * The abort is true only when f2fs_evict_inode is called. - * Basically, the f2fs_evict_inode doesn't produce any data writes, so - * that we don't need to call f2fs_balance_fs. - * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this - * inode becomes free by iget_locked in f2fs_iget. - */ - if (!abort) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + struct page *page = cur->page; + + lock_page(page); + if (page->mapping == inode->i_mapping) { + trace_f2fs_commit_inmem_page(page, INMEM); + + set_page_dirty(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (clear_page_dirty_for_io(page)) + inode_dec_dirty_pages(inode); + + fio.page = page; + err = do_write_data_page(&fio); + if (err) { + unlock_page(page); + break; + } + + /* record old blkaddr for revoking */ + cur->old_addr = fio.old_blkaddr; + + clear_cold_data(page); + submit_bio = true; + } + unlock_page(page); + list_move_tail(&cur->list, revoke_list); } - mutex_lock(&fi->inmem_lock); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - lock_page(cur->page); - if (!abort) { - if (cur->page->mapping == inode->i_mapping) { - set_page_dirty(cur->page); - f2fs_wait_on_page_writeback(cur->page, DATA); - if (clear_page_dirty_for_io(cur->page)) - inode_dec_dirty_pages(inode); - trace_f2fs_commit_inmem_page(cur->page, INMEM); - fio.page = cur->page; - err = do_write_data_page(&fio); - if (err) { - unlock_page(cur->page); - break; - } - clear_cold_data(cur->page); - submit_bio = true; - } - } else { - trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); - } - set_page_private(cur->page, 0); - ClearPagePrivate(cur->page); - f2fs_put_page(cur->page, 1); + if (submit_bio) + f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); - 
list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + if (!err) + __revoke_inmem_pages(inode, revoke_list, false, false); + + return err; +} + +int commit_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct list_head revoke_list; + int err; + + INIT_LIST_HEAD(&revoke_list); + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + mutex_lock(&fi->inmem_lock); + err = __commit_inmem_pages(inode, &revoke_list); + if (err) { + int ret; + /* + * try to revoke all committed pages, but still we could fail + * due to no memory or other reason, if that happened, EAGAIN + * will be returned, which means in such case, transaction is + * already not integrity, caller should use journal to do the + * recovery or rewrite & commit last transaction. For other + * error number, revoking was done by filesystem itself. + */ + ret = __revoke_inmem_pages(inode, &revoke_list, false, true); + if (ret) + err = ret; + + /* drop all uncommitted pages */ + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } mutex_unlock(&fi->inmem_lock); - if (!abort) { - f2fs_unlock_op(sbi); - if (submit_bio) - f2fs_submit_merged_bio(sbi, DATA, WRITE); - } + f2fs_unlock_op(sbi); return err; } @@ -281,13 +341,25 @@ int commit_inmem_pages(struct inode *inode, bool abort) * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + + if (!need) + return; + + /* balance_fs_bg is able to be pending */ + if (excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi); + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. 
*/ - if (has_not_enough_free_secs(sbi, 0)) { + if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, false); } @@ -304,14 +376,26 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + try_to_free_nids(sbi, MAX_FREE_NIDS); + else + build_free_nids(sbi); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || - excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + excess_prefree_segs(sbi) || + excess_dirty_nats(sbi) || + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) { + struct blk_plug plug; + + blk_start_plug(&plug); + sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + } f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -361,24 +445,28 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) { + if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { struct bio *bio = f2fs_bio_alloc(0); int ret; + atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); + atomic_dec(&fcc->submit_flush); bio_put(bio); return ret; } init_completion(&cmd.wait); + atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); return cmd.ret; } @@ -392,6 +480,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; + atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); 
SM_I(sbi)->cmd_control_info = fcc; @@ -513,28 +602,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - int err = -ENOTSUPP; - - if (test_opt(sbi, DISCARD)) { - struct seg_entry *se = get_seg_entry(sbi, - GET_SEGNO(sbi, blkaddr)); - unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - - if (f2fs_test_bit(offset, se->discard_map)) - return false; - - err = f2fs_issue_discard(sbi, blkaddr, 1); - } - - if (err) { - update_meta_page(sbi, NULL, blkaddr); - return true; - } - return false; -} - static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) @@ -573,7 +640,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool force = (cpc->reason == CP_DISCARD); int i; - if (se->valid_blocks == max_blocks) + if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) return; if (!force) { @@ -593,6 +660,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -630,6 +701,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason == CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -646,17 +719,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (!test_opt(sbi, DISCARD)) + if (force || !test_opt(sbi, DISCARD)) continue; - f2fs_issue_discard(sbi, 
START_BLOCK(sbi, start), + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SECNO(sbi, start); + start_segno = secno * sbi->segs_per_sec; + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + if (force && entry->len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); cpc->trimmed += entry->len; @@ -711,12 +798,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -817,12 +906,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) } } - sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -841,9 +930,9 @@ void 
update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) void *dst = page_address(page); if (src) - memcpy(dst, src, PAGE_CACHE_SIZE); + memcpy(dst, src, PAGE_SIZE); else - memset(dst, 0, PAGE_CACHE_SIZE); + memset(dst, 0, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -854,6 +943,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi, update_meta_page(sbi, (void *)sum_blk, blk_addr); } +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct page *page = grab_meta_page(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = (struct f2fs_summary_block *)page_address(page); + + mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + set_page_dirty(page); + f2fs_put_page(page, 1); +} + static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -886,9 +1000,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - MAIN_SEGS(sbi), *newseg + 1); - if (segno - *newseg < sbi->segs_per_sec - - (*newseg % sbi->segs_per_sec)) + (hint + 1) * sbi->segs_per_sec, *newseg + 1); + if (segno < (hint + 1) * sbi->segs_per_sec) goto got_it; } find_other_zone: @@ -1071,7 +1184,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 
0)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1120,6 +1233,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + if (test_opt(sbi, LFS)) + return; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segments(sbi, i); } @@ -1134,6 +1250,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1142,6 +1259,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (end <= MAIN_BLKADDR(sbi)) goto out; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + goto out; + } + /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : @@ -1164,12 +1287,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1256,7 +1383,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0)) + !has_not_enough_free_secs(sbi, 0, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1292,11 +1419,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); - allocate_data_block(fio->sbi, fio->page, fio->blk_addr, - &fio->blk_addr, sum, type); + if (fio->type == NODE || fio->type == DATA) + mutex_lock(&fio->sbi->wio_mutex[fio->type]); + + allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ f2fs_submit_page_mbio(fio); + + if (fio->type == NODE || fio->type == DATA) + mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1305,7 +1438,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .rw = WRITE_SYNC | REQ_META | REQ_PRIO, - .blk_addr = page->index, + .old_blkaddr = page->index, + .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, }; @@ -1335,19 +1469,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); - 
dn->data_blkaddr = fio->blk_addr; + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } void rewrite_data_page(struct f2fs_io_info *fio) { + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); f2fs_submit_page_mbio(fio); } -static void __f2fs_replace_block(struct f2fs_sb_info *sbi, - struct f2fs_summary *sum, +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg) + bool recover_curseg, bool recover_newaddr) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -1390,7 +1524,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg) + if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); @@ -1414,66 +1548,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, - unsigned char version, bool recover_curseg) + unsigned char version, bool recover_curseg, + bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); + __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr); - dn->data_blkaddr = new_addr; - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); -} - -static inline bool is_merged_page(struct f2fs_sb_info *sbi, - struct page *page, enum page_type type) -{ - enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio_vec *bvec; - struct page *target; - int i; - - down_read(&io->io_rwsem); - if (!io->bio) { - up_read(&io->io_rwsem); - return false; - } - - bio_for_each_segment_all(bvec, io->bio, 
i) { - - if (bvec->bv_page->mapping) { - target = bvec->bv_page; - } else { - struct f2fs_crypto_ctx *ctx; - - /* encrypted page */ - ctx = (struct f2fs_crypto_ctx *)page_private( - bvec->bv_page); - target = ctx->w.control_page; - } - - if (page == target) { - up_read(&io->io_rwsem); - return true; - } - } - - up_read(&io->io_rwsem); - return false; + f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type) + enum page_type type, bool ordered) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - if (is_merged_page(sbi, page, type)) - f2fs_submit_merged_bio(sbi, type, WRITE); - wait_on_page_writeback(page); + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + if (ordered) + wait_on_page_writeback(page); + else + wait_for_stable_page(page); } } @@ -1482,14 +1580,12 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, { struct page *cpage; - if (blkaddr == NEW_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return; - f2fs_bug_on(sbi, blkaddr == NULL_ADDR); - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_wait_on_page_writeback(cpage, DATA, true); f2fs_put_page(cpage, 1); } } @@ -1510,12 +1606,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, - SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ @@ -1539,7 +1634,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] 
= *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1611,7 +1706,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; @@ -1626,7 +1728,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int type = CURSEG_HOT_DATA; int err; - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = npages_for_summary_flush(sbi, true); if (npages >= 2) @@ -1666,13 +1768,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, - SUM_JOURNAL_SIZE); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ @@ -1694,7 +1795,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) 
continue; @@ -1718,17 +1819,13 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, else end = type + NR_CURSEG_NODE_TYPE; - for (i = type; i < end; i++) { - struct curseg_info *sum = CURSEG_I(sbi, i); - mutex_lock(&sum->curseg_mutex); - write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); - mutex_unlock(&sum->curseg_mutex); - } + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); } void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -1739,24 +1836,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { - for (i = 0; i < nats_in_cursum(sum); i++) { - if (le32_to_cpu(nid_in_journal(sum, i)) == val) + for (i = 0; i < nats_in_cursum(journal); i++) { + if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) - return update_nats_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { - for (i = 0; i < sits_in_cursum(sum); i++) - if (le32_to_cpu(segno_in_journal(sum, i)) == val) + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) - return update_sits_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); } return -1; } @@ -1785,7 +1882,7 @@ static struct 
page *get_next_sit_page(struct f2fs_sb_info *sbi, src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -1860,20 +1957,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi) static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; - segno = le32_to_cpu(segno_in_journal(sum, i)); + segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } /* @@ -1885,13 +1984,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = true; struct seg_entry *se; - mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) @@ -1908,7 +2006,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and add and account * them in sit entry set. 
*/ - if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL)) remove_sits_in_journal(sbi); /* @@ -1925,10 +2023,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int segno = start_segno; if (to_journal && - !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; - if (!to_journal) { + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { page = get_next_sit_page(sbi, start_segno); raw_sit = page_address(page); } @@ -1946,13 +2046,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); - segno_in_journal(sum, offset) = + segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, - &sit_in_journal(sum, offset)); + &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, @@ -1964,7 +2064,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) ses->entry_cnt--; } - if (!to_journal) + if (to_journal) + up_write(&curseg->journal_rwsem); + else f2fs_put_page(page, 1); f2fs_bug_on(sbi, ses->entry_cnt); @@ -1979,7 +2081,6 @@ out: add_discard_addrs(sbi, cpc); } mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); set_prefree_as_free_segments(sbi); } @@ -2015,12 +2116,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || - !sit_i->sentries[start].ckpt_valid_map || - !sit_i->sentries[start].discard_map) + 
!sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; + + if (f2fs_discard_en(sbi)) { + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } } sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2108,9 +2213,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; + init_rwsem(&array[i].journal_rwsem); + array[i].journal = kzalloc(sizeof(struct f2fs_journal), + GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; } @@ -2121,11 +2231,13 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi); + int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); @@ -2134,41 +2246,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; struct page *page; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) - == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; - } - } - mutex_unlock(&curseg->curseg_mutex); - + se = 
&sit_i->sentries[start]; page = get_current_sit_page(sbi, start); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); -got_it: + check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ - memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; - - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - + se->valid_blocks; } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks - old_valid_blocks; + } + up_read(&curseg->journal_rwsem); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -2301,7 +2430,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + 
sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; @@ -2383,8 +2516,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) if (!array) return; SM_I(sbi)->curseg_array = NULL; - for (i = 0; i < NR_CURSEG_TYPE; i++) + for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); + kfree(array[i].journal); + } kfree(array); } @@ -2450,7 +2585,7 @@ int __init create_segment_manager_caches(void) sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destory_discard_entry; + goto destroy_discard_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2460,7 +2595,7 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destory_discard_entry: +destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ee44d346ea44..fecb856ad874 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -158,16 +159,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid 
blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -183,7 +185,7 @@ struct segment_allocation { * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. */ -#define ATOMIC_WRITTEN_PAGE 0x0000ffff +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) @@ -191,6 +193,7 @@ struct segment_allocation { struct inmem_pages { struct list_head list; struct page *page; + block_t old_addr; /* for revoking when fail to commit */ }; struct sit_info { @@ -257,6 +260,8 @@ struct victim_selection { struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ @@ -466,20 +471,27 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (test_opt(sbi, LFS)) + return false; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + reserved_sections(sbi) + 1); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { int node_secs = 
get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi)); + return (free_sections(sbi) + freed) <= + (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -527,6 +539,9 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; + if (test_opt(sbi, LFS)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -540,7 +555,7 @@ static inline bool need_inplace_update(struct inode *inode) /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + is_inode_flag_set(inode, FI_NEED_IPU)) return true; return false; @@ -573,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -702,9 +717,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) - return 3 * sbi->blocks_per_seg; + return 8 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(sbi); + return 8 * MAX_BIO_BLOCKS(sbi); else return 0; } @@ -722,10 +737,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) - desired = 3 * max_hw_blocks(sbi); + if (type == NODE) + 
desired = 2 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..46c915425923 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -13,6 +13,7 @@ #include #include "f2fs.h" +#include "node.h" static LIST_HEAD(f2fs_list); static DEFINE_SPINLOCK(f2fs_list_lock); @@ -25,14 +26,15 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) - return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) + return NM_I(sbi)->fcnt - MAX_FREE_NIDS; return 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132352..fd249cc9b96e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,35 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", +}; + +static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned int rate) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(ffi, 0, sizeof(struct f2fs_fault_info)); + } +} +#endif + /* f2fs-wide shrinker 
description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -51,6 +80,7 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, + Opt_nodiscard, Opt_noheap, Opt_user_xattr, Opt_nouser_xattr, @@ -61,12 +91,19 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_inline_dentry, + Opt_noinline_dentry, Opt_flush_merge, + Opt_noflush_merge, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, + Opt_mode, + Opt_fault_injection, + Opt_lazytime, + Opt_nolazytime, Opt_err, }; @@ -75,6 +112,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, @@ -85,12 +123,19 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, + {Opt_mode, "mode=%s"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_err, NULL}, }; @@ -100,6 +145,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -121,9 +170,27 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; 
+#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif return NULL; } +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -157,6 +224,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -202,6 +273,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \ f2fs_sbi_show, f2fs_sbi_store, \ offsetof(struct struct_name, elname)) +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); @@ -214,9 +288,16 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, 
cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -234,7 +315,14 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), NULL, }; @@ -330,6 +418,8 @@ static int parse_options(struct super_block *sb, char *options) "the device does not support discard"); } break; + case Opt_nodiscard: + clear_opt(sbi, DISCARD); case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -388,9 +478,15 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; @@ -406,6 +502,42 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case 
Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, arg); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -426,26 +558,25 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - - set_inode_flag(fi, FI_NEW_INODE); - - if (test_opt(F2FS_SB(sb), INLINE_XATTR)) - set_inode_flag(fi, FI_INLINE_XATTR); + init_rwsem(&fi->dio_rwsem[READ]); + init_rwsem(&fi->dio_rwsem[WRITE]); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - fi->i_crypt_info = NULL; -#endif return &fi->vfs_inode; } @@ -458,7 +589,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); @@ -466,32 +597,66 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage 
*/ f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); - i_size_write(inode, 0); + f2fs_i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, - F2FS_I(inode)->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } return 0; } + return generic_drop_inode(inode); } +int f2fs_inode_dirtied(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 1; + } + + set_inode_flag(inode, FI_DIRTY_INODE); + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + stat_inc_dirty_inode(sbi, DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + return 0; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + list_del_init(&F2FS_I(inode)->gdirty_list); + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * @@ -499,7 +664,19 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + 
return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode); } static void f2fs_i_callback(struct rcu_head *head) @@ -510,15 +687,27 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -534,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb) * clean checkpoint again. */ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -548,12 +737,15 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. 
*/ - release_dirty_inode(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_bios(sbi); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -566,13 +758,18 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +779,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -623,7 +818,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -676,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) @@ -686,6 +883,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, 
",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -718,19 +923,47 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) +static int segment_bits_seq_show(struct seq_file *seq, void *offset) { - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; } -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .owner = THIS_MODULE, \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS 
parameters */ @@ -738,7 +971,16 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + sbi->sb->s_flags |= MS_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + set_opt(sbi, DISCARD); + } else { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -746,6 +988,10 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0); +#endif } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -756,8 +1002,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - - sync_filesystem(sb); +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info ffi = sbi->fault_info; +#endif /* * Save the old mount options in case we @@ -766,6 +1013,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + sbi->mount_opt.opt = 0; default_options(sbi); @@ -797,7 +1053,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -807,6 +1062,16 @@ static int f2fs_remount(struct super_block *sb, int 
*flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -820,8 +1085,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -834,6 +1100,9 @@ restore_gc: restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; +#ifdef CONFIG_F2FS_FAULT_INJECTION + sbi->fault_info = ffi; +#endif return err; } @@ -853,6 +1122,48 @@ static struct super_operations f2fs_sops = { .remount_fs = f2fs_remount, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static unsigned f2fs_max_namelen(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? 
+ inode->i_sb->s_blocksize : F2FS_NAME_LEN; +} + +static struct fscrypt_operations f2fs_cryptops = { + .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, + .set_context = f2fs_set_context, + .is_encrypted = f2fs_encrypted_inode, + .empty_dir = f2fs_empty_dir, + .max_namelen = f2fs_max_namelen, +}; +#else +static struct fscrypt_operations f2fs_cryptops = { + .is_encrypted = f2fs_encrypted_inode, +}; +#endif + static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -898,7 +1209,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,13 +1225,131 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } -static int sanity_check_raw_super(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) { + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = 
le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_end_blkaddr > seg_end_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) 
end(%u) block(%u)", + main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_msg(sb, KERN_INFO, + "Fix alignment : %s, start(%u) end(%u) block(%u)", + res, main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } + return false; +} + +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -931,10 +1360,10 @@ static int sanity_check_raw_super(struct super_block *sb, } /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + if (F2FS_BLKSIZE != PAGE_SIZE) { f2fs_msg(sb, KERN_INFO, "Invalid page_cache_size (%lu), supports only 4KB\n", - PAGE_CACHE_SIZE); + PAGE_SIZE); return 1; } @@ -947,6 +1376,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || @@ -965,10 +1402,27 @@ static int 
sanity_check_raw_super(struct super_block *sb, le32_to_cpu(raw_super->log_sectorsize)); return 1; } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return 1; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sbi, bh)) + return 1; + return 0; } -static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -994,7 +1448,6 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1014,111 +1467,131 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + mutex_init(&sbi->wio_mutex[NODE]); + mutex_init(&sbi->wio_mutex[DATA]); + spin_lock_init(&sbi->cp_lock); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = 
percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); } /* * Read f2fs raw super block. - * Because we have two copies of super block, so read the first one at first, - * if the first one is invalid, move to read the second one. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * the recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { - int block = 0; - struct buffer_head *buffer; + struct super_block *sb = sbi->sb; + int block; + struct buffer_head *bh; struct f2fs_super_block *super; int err = 0; -retry: - buffer = sb_bread(sb, block); - if (!buffer) { - *recovery = 1; - f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + bh = sb_bread(sb, block); + if (!bh) { + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { err = -EIO; - goto out; + continue; } - } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); - - /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); - *recovery = 1; - f2fs_msg(sb, KERN_ERR, - "Can't find valid F2FS filesystem in %dth superblock", - block + 1); - if (block == 0) { - block++; - goto retry; - } else { + /* sanity checking of raw super */ + if (sanity_check_raw_super(sbi, bh)) { + f2fs_msg(sb, KERN_ERR, + "Can't find
valid F2FS filesystem in %dth superblock", + block + 1); err = -EINVAL; - goto out; + brelse(bh); + continue; } + + if (!*raw_super) { + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + brelse(bh); } - if (!*raw_super) { - *raw_super_buf = buffer; - *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); - } + /* Failed to read or validate at least one of the superblocks */ + if (err < 0) + *recovery = 1; - /* check the validity of the second superblock */ - if (block == 0) { - block++; - goto retry; - } - -out: /* No valid superblock */ if (!*raw_super) - return err; + kfree(super); + else + err = 0; - return 0; + return err; } int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; + struct buffer_head *bh; int err; - /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } - sbh->b_blocknr = block; + /* write back-up superblock first */ + bh = sb_getblk(sbi->sb, sbi->valid_super_block ?
0: 1); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); + bh = sb_getblk(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); return err; } @@ -1126,17 +1599,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1144,17 +1617,31 @@ try_onemore: if (!sbi) return -ENOMEM; + sbi->sb = sb; + + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver."); + err = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto free_sbi; + } + /* set a block size */ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1167,11 +1654,14 @@ try_onemore: if (err) goto free_options; - 
sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); sb->s_op = &f2fs_sops; + sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; @@ -1181,11 +1671,8 @@ try_onemore: memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; - sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); @@ -1206,6 +1693,10 @@ try_onemore: init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1220,24 +1711,19 @@ try_onemore: goto free_meta_inode; } - /* sanity checking of checkpoint */ - err = -EINVAL; - if (sanity_check_ckpt(sbi)) { - f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); - goto free_cp; - } - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + 
spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1257,6 +1743,17 @@ try_onemore: goto free_nm; } + /* For write statistics */ + if (sb->s_bdev->bd_part) + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + build_gc_manager(sbi); /* get an inode for node space */ @@ -1300,9 +1797,12 @@ try_onemore: if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1318,7 +1818,7 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. 
*/ if (bdev_read_only(sb->s_bdev) && - !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; goto free_kobj; } @@ -1326,14 +1826,27 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + if (!retry) + goto skip_recovery; + + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); + goto free_kobj; + } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); goto free_kobj; } } +skip_recovery: /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1350,20 +1863,26 @@ try_onemore: kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { - f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); - f2fs_commit_super(sbi, true); + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_msg(sb, KERN_INFO, + "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 
1 : 2, err); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: + f2fs_sync_inode_meta(sbi); kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); @@ -1371,7 +1890,9 @@ free_root_inode: dput(sb->s_root); sb->s_root = NULL; free_node_inode: + truncate_inode_pages_final(NODE_MAPPING(sbi)); mutex_lock(&sbi->umount_mutex); + release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); @@ -1379,16 +1900,18 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -free_cp: kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); /* give only one another chance */ @@ -1424,8 +1947,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1467,25 +1991,23 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } - err = f2fs_init_crypto(); - if (err) - goto free_kset; - err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_crypto; + goto free_kset; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto 
free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_crypto: - f2fs_exit_crypto(); free_kset: kset_unregister(f2fs_kset); free_extent_cache: @@ -1506,15 +2028,14 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); - f2fs_exit_crypto(); + unregister_shrinker(&f2fs_shrinker_info); + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 145fb659ad44..562ce0821559 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -29,7 +29,8 @@ static inline void __print_last_io(void) last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, last_io.fio.blk_addr, + last_io.fio.rw, + last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); } @@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) last_io.pid == pid && last_io.type == __file_type(inode, pid) && last_io.fio.rw == fio->rw && - last_io.fio.blk_addr + last_io.len == fio->blk_addr) { + last_io.fio.new_blkaddr + last_io.len == + fio->new_blkaddr) { last_io.len++; return; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 862368a32e53..69c6bb9cf207 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -264,18 +264,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static void 
*read_all_xattrs(struct inode *inode, struct page *ipage) +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; + int err; inline_size = inline_xattr_size(inode); txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) - return NULL; + return -ENOMEM; /* read from inline xattr */ if (inline_size) { @@ -286,8 +288,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_addr = inline_xattr_addr(ipage); } else { page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + err = PTR_ERR(page); goto fail; + } inline_addr = inline_xattr_addr(page); } memcpy(txattr_addr, inline_addr, inline_size); @@ -301,8 +305,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) /* The inode already has an extended attribute block. */ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); goto fail; + } xattr_addr = page_address(xpage); memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); @@ -316,10 +322,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); header->h_refcount = cpu_to_le32(1); } - return txattr_addr; + *base_addr = txattr_addr; + return 0; fail: kzfree(txattr_addr); - return NULL; + return err; } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, @@ -345,7 +352,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); + set_page_dirty(ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -353,7 +361,7 @@ static inline int write_all_xattrs(struct inode 
*inode, __u32 hsize, return PTR_ERR(page); } inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); f2fs_put_page(page, 1); @@ -374,7 +382,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(xpage); } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE); + f2fs_wait_on_page_writeback(xpage, NODE, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -412,9 +420,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { @@ -448,9 +456,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; - base_addr = read_all_xattrs(inode, NULL); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, NULL, &base_addr); + if (error) + return error; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -481,13 +489,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t len; __u32 new_hsize; - int error = -ENOMEM; + int error = 0; if (name == NULL) return -EINVAL; @@ -503,9 +510,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - goto exit; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; /* 
find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -538,7 +545,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } @@ -566,7 +573,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. */ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); @@ -580,19 +586,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - - if (ipage) - update_inode(inode, ipage); - else - update_inode_page(inode); + f2fs_mark_inode_dirty_sync(inode); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: kzfree(base_addr); return error; @@ -609,7 +613,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -618,5 +622,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100d5492..d2fd0387a3c7 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #define f2fs_xattr_handlers 
NULL static inline int f2fs_setxattr(struct inode *inode, int index, - const char *name, const void *value, size_t size, int flags) + const char *name, const void *value, size_t size, + struct page *page, int flags) { return -EOPNOTSUPP; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8d7151eb6ceb..2d0a78050936 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -228,6 +228,7 @@ struct dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ +#define DCACHE_ENCRYPTED_WITH_KEY 0x04000000 /* dir is encrypted with a valid key */ #define DCACHE_OP_REAL 0x08000000 extern seqlock_t rename_lock; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 25c6324a0dd0..422630b8e588 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -21,7 +21,7 @@ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ #define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ -#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE) +#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ @@ -51,6 +51,7 @@ #define MAX_ACTIVE_DATA_LOGS 8 #define VERSION_LEN 256 +#define MAX_VOLUME_NAME 512 /* * For superblock @@ -84,7 +85,7 @@ struct f2fs_super_block { __le32 node_ino; /* node inode number */ __le32 meta_ino; /* meta inode number */ __u8 uuid[16]; /* 128-bit uuid for volume */ - __le16 volume_name[512]; /* volume name */ + __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */ __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ __le32 cp_payload; @@ -99,6 +100,7 @@ struct f2fs_super_block { /* * For 
checkpoint */ +#define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 #define CP_ERROR_FLAG 0x00000008 @@ -169,12 +171,12 @@ struct f2fs_extent { #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ -#define ADDRS_PER_INODE(fi) addrs_per_inode(fi) +#define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ -#define ADDRS_PER_PAGE(page, fi) \ - (IS_INODE(page) ? ADDRS_PER_INODE(fi) : ADDRS_PER_BLOCK) +#define ADDRS_PER_PAGE(page, inode) \ + (IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) @@ -261,7 +263,7 @@ struct f2fs_node { /* * For NAT entries */ -#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -281,7 +283,7 @@ struct f2fs_nat_block { * Not allow to change this. */ #define SIT_VBLOCK_MAP_SIZE 64 -#define SIT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_sit_entry)) +#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) /* * Note that f2fs_sit_entry->vblocks has the following bit-field information. 
@@ -344,7 +346,7 @@ struct f2fs_summary { struct summary_footer { unsigned char entry_type; /* SUM_TYPE_XXX */ - __u32 check_sum; /* summary checksum */ + __le32 check_sum; /* summary checksum */ } __packed; #define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\ @@ -357,6 +359,12 @@ struct summary_footer { sizeof(struct sit_journal_entry)) #define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ sizeof(struct sit_journal_entry)) + +/* Reserved area should make size of f2fs_extra_info equals to + * that of nat_journal and sit_journal. + */ +#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8) + /* * frequently updated NAT/SIT entries can be stored in the spare area in * summary blocks @@ -386,18 +394,28 @@ struct sit_journal { __u8 reserved[SIT_JOURNAL_RESERVED]; } __packed; -/* 4KB-sized summary block structure */ -struct f2fs_summary_block { - struct f2fs_summary entries[ENTRIES_IN_SUM]; +struct f2fs_extra_info { + __le64 kbytes_written; + __u8 reserved[EXTRA_INFO_RESERVED]; +} __packed; + +struct f2fs_journal { union { __le16 n_nats; __le16 n_sits; }; - /* spare area is used by NAT or SIT journals */ + /* spare area is used by NAT or SIT journals or extra info */ union { struct nat_journal nat_j; struct sit_journal sit_j; + struct f2fs_extra_info info; }; +} __packed; + +/* 4KB-sized summary block structure */ +struct f2fs_summary_block { + struct f2fs_summary entries[ENTRIES_IN_SUM]; + struct f2fs_journal journal; struct summary_footer footer; } __packed; @@ -491,4 +509,6 @@ enum { F2FS_FT_MAX }; +#define S_SHIFT 12 + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0166582c4d78..a88271902ff2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -52,6 +52,8 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct fscrypt_info; +struct fscrypt_operations; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -676,6 +678,9 @@ struct inode { struct 
hlist_head i_fsnotify_marks; #endif +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + struct fscrypt_info *i_crypt_info; +#endif void *i_private; /* fs or device private pointer */ }; @@ -1331,6 +1336,8 @@ struct super_block { #endif const struct xattr_handler **s_xattr; + const struct fscrypt_operations *s_cop; + struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h new file mode 100644 index 000000000000..76cff18bb032 --- /dev/null +++ b/include/linux/fscrypto.h @@ -0,0 +1,435 @@ +/* + * General per-file encryption definition + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#ifndef _LINUX_FSCRYPTO_H +#define _LINUX_FSCRYPTO_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 
nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_WRITE_PATH_FL 0x00000002 + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ + u8 mode; /* Encryption mode for tfm */ +}; + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int fscrypt_key_size(int mode) +{ + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + return FS_AES_256_XTS_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_GCM: + return FS_AES_256_GCM_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CBC: + return FS_AES_256_CBC_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CTS: + return FS_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define FS_FNAME_NUM_SCATTER_ENTRIES 4 +#define FS_CRYPTO_BLOCK_SIZE 16 +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of 
the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 fscrypt_symlink_data_len(u32 l) +{ + if (l < FS_CRYPTO_BLOCK_SIZE) + l = FS_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct fscrypt_symlink_data) - 1); +} + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * crypto operations for filesystems + */ +struct fscrypt_operations { + int (*get_context)(struct inode *, void *, size_t); + int (*key_prefix)(struct inode *, u8 **); + int (*prepare_context)(struct inode *); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); +} + +static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) +{ + if (size == fscrypt_key_size(mode)) + return size; + return 0; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] ==
'.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +extern const struct dentry_operations fscrypt_d_ops; +#endif + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + d_set_d_op(dentry, &fscrypt_d_ops); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +int fscrypt_initialize(void); + +extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); +extern int fscrypt_decrypt_page(struct page *); +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern void fscrypt_restore_control_page(struct page *); +extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, + unsigned int); +/* policy.c */ +extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); +extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int 
fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int get_crypt_info(struct inode *); +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct fscrypt_name *); +extern void fscrypt_free_filename(struct fscrypt_name *); +extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); +#endif + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, + gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) +{ + return; +} + +static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, + struct page *p, gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_notsupp_decrypt_page(struct page *p) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c, + struct bio *b) +{ + return; +} + +static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b) +{ + return; +} + +static inline void fscrypt_notsupp_restore_control_page(struct page *p) +{ + return; +} + +static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, + sector_t s, unsigned int f) +{ + return -EOPNOTSUPP; +} + +/* policy.c */ +static inline int fscrypt_notsupp_process_policy(struct file *f, + const struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_get_policy(struct 
inode *i, + struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_has_permitted_context(struct inode *p, + struct inode *i) +{ + return 0; +} + +static inline int fscrypt_notsupp_inherit_context(struct inode *p, + struct inode *i, void *v, bool b) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_notsupp_get_encryption_info(struct inode *i) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_put_encryption_info(struct inode *i, + struct fscrypt_info *f) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_notsupp_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c) +{ + return; +} + +static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} +#endif /* _LINUX_FSCRYPTO_H */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 00b4a6308249..3a09bb4dc3b2 100644 --- a/include/trace/events/f2fs.h +++ 
b/include/trace/events/f2fs.h @@ -52,6 +52,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -693,28 +694,32 @@ TRACE_EVENT(f2fs_direct_IO_exit, __entry->ret) ); -TRACE_EVENT(f2fs_reserve_new_block, +TRACE_EVENT(f2fs_reserve_new_blocks, - TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node), + TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, + blkcnt_t count), - TP_ARGS(inode, nid, ofs_in_node), + TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) + __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; + __entry->count = count; ), - TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u", + TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry), (unsigned int)__entry->nid, - __entry->ofs_in_node) + __entry->ofs_in_node, + (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_page_bio, @@ -727,7 +732,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) - __field(block_t, blkaddr) + __field(block_t, old_blkaddr) + __field(block_t, new_blkaddr) __field(int, rw) __field(int, type) ), @@ -736,16 +742,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; - __entry->blkaddr = fio->blk_addr; + __entry->old_blkaddr = fio->old_blkaddr; + __entry->new_blkaddr = fio->new_blkaddr; __entry->rw = fio->rw; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "blkaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", 
show_dev_ino(__entry), (unsigned long)__entry->index, - (unsigned long long)__entry->blkaddr, + (unsigned long long)__entry->old_blkaddr, + (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->rw), show_block_type(__entry->type)) ); @@ -1265,6 +1273,44 @@ TRACE_EVENT(f2fs_destroy_extent_tree, __entry->node_cnt) ); +DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(s64, count) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->type = type; + __entry->count = count; + ), + + TP_printk("dev = (%d,%d), %s, dirty count = %lld", + show_dev(__entry), + show_file_type(__entry->type), + __entry->count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f15d980249b5..c8c093e8c83d 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -170,6 +170,24 @@ struct inodes_stat_t { #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +/* + * File system encryption support + */ +/* Policy provided via an ioctl on the topmost directory */ +#define FS_KEY_DESCRIPTOR_SIZE 8 + +struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; +} __packed; + +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 
21, struct fscrypt_policy) + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ From a1561fae1b30467bddc2e0cf7752125ee54fe2e4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 13:38:41 -0700 Subject: [PATCH 002/804] f2fs: fix wrong sum_page pointer in f2fs_gc This patch fixes using a wrong pointer for sum_page in f2fs_gc. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0a0a1ad1fe1f..4336807cc690 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -848,16 +848,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); From 2ca2001b3a36ad58081e6907c396072a80a1ecc9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Nov 2016 14:06:03 -0800 Subject: [PATCH 003/804] posix_acl: Clear SGID bit when setting file permissions Cherry-pick to f2fs only for generic/375 from: (073931017: posix_acl: Clear SGID bit when setting file permissions) Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 ++---- fs/posix_acl.c | 31 +++++++++++++++++++++++++++++++ include/linux/posix_acl.h | 1 + 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fb0744b94c2f..4a34040932e9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -215,12 +215,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = 
posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(inode, inode->i_mode); - if (error == 0) - acl = NULL; } break; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 34bd1bd354e6..a60d3cc5b55d 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -592,6 +592,37 @@ no_mem: } EXPORT_SYMBOL_GPL(posix_acl_create); +/** + * posix_acl_update_mode - update mode in set_acl + * + * Update the file mode when setting an ACL: compute the new file permission + * bits based on the ACL. In addition, if the ACL is equivalent to the new + * file mode, set *acl to NULL to indicate that no ACL should be set. + * + * As with chmod, clear the setgid bit if the caller is not in the owning group + * or capable of CAP_FSETID (see inode_change_ok). + * + * Called from set_acl inode operations. + */ +int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} +EXPORT_SYMBOL(posix_acl_update_mode); + /* * Fix up the uids and gids in posix acl extended attributes in place.
*/ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 3e96a6a76103..d1a8ad7e5ae4 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -95,6 +95,7 @@ extern int set_posix_acl(struct inode *, int, struct posix_acl *); extern int posix_acl_chmod(struct inode *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); +extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **); extern int simple_set_acl(struct inode *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); From 4d42545f4996ba997eecd6c02bd1e3a816695bcd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Nov 2016 10:51:17 -0800 Subject: [PATCH 004/804] f2fs: fix overflow due to condition check order In the last ilen case, i was already increased, resulting in accessing out- of-boundary entry of do_replace and blkaddr. Fix to check ilen first to exit the loop. Fixes: 2aa8fbb9693020 ("f2fs: refactor __exchange_data_block for speed up") Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c6e33258fabf..5c4ea4cf2fb1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -971,7 +971,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { From 34a546cb043f95529a24ff042f2cdcf72b25b4f0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Nov 2016 12:45:15 -0800 Subject: [PATCH 005/804] f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack We don't guarantee cp_addr is fixed by cp_version. This is to sync with f2fs-tools. 
Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +++++++- fs/f2fs/f2fs.h | 28 +++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cb23d6cf676b..1608ae8eea97 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -770,6 +770,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) if (sanity_check_ckpt(sbi)) goto fail_no_cp; + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + if (cp_blks <= 1) goto done; @@ -1121,7 +1126,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1185,6 +1190,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index af293e84e5cd..45d1e4522760 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -1354,22 +1355,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - start_addr = 
le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; - return start_addr; } +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); From 75a192655e64b2a76433acdc759cbd509de1efac Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:34 +0800 Subject: [PATCH 006/804] f2fs: exclude free nids building and allocation During nid allocation, it needs to exclude building and allocating flow of free nids, this is because while building free nid cache, there are two steps: a) load free nids from unused nat entries in NAT pages, b) update free nid cache by checking nat journal. The two steps should be atomic, otherwise a used nid can be allocated as a free one after a) and before b). This patch adds the missing lock which covers build_free_nids in unlock_operation and f2fs_balance_fs_bg to avoid that.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b1e615ed2bef..a8c2bd3e5029 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1786,7 +1786,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1840,6 +1840,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1876,9 +1883,7 @@ retry: spin_unlock(&nm_i->free_nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); goto retry; } From 7a2d5d5f8150767cc1db952bf775f2d712dc33f1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:00 +0800 Subject: [PATCH 007/804] f2fs: fix to release discard entries during checkpoint In f2fs_fill_super, if there is any IO error occurs during recovery, cached discard entries will be leaked, in order to avoid this, make write_checkpoint() handle memory release by itself, besides, move clear_prefree_segments to write_checkpoint for readability. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/checkpoint.c --- fs/f2fs/checkpoint.c | 5 ++++- fs/f2fs/super.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1608ae8eea97..63ca342a3cc8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1187,7 +1187,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1264,6 +1263,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); + if (err) + release_discard_addrs(sbi); + else + clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fd249cc9b96e..006138a6c5ab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -738,7 +738,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); From 372f295d622c643f865e3cb83b2cf9b23f5bc49b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:01 +0800 Subject: [PATCH 008/804] f2fs: give a chance to detach from dirty list If there is no dirty pages in inode, we should give a chance to detach the inode from global dirty list, otherwise it needs to call another unnecessary .writepages for detaching. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++--- fs/f2fs/dir.c | 1 + fs/f2fs/gc.c | 4 +++- fs/f2fs/inline.c | 4 +++- fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 4 +++- 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7a3ac306a57c..15c0fe40ed5c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1776,12 +1776,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e634a637c443..c0dba11519cf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4336807cc690..72a0ca08f901 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -670,8 +670,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a04c1016d511..b21a0788f2cd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -136,8 +136,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); 
f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8c2bd3e5029..97eb2c0811b5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1203,6 +1203,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3c61ae37f92..75477ec6c535 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,8 +272,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); From 48cceaae9acbd68469dfd371419fee736bad3e58 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:02 +0800 Subject: [PATCH 009/804] f2fs: add missing f2fs_balance_fs in f2fs_zero_range f2fs_balance_fs should be called in between node page updates; otherwise the node page count will far exceed the watermark for triggering foreground garbage collection, resulting in a high risk of hitting LFS allocation failure.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c4ea4cf2fb1..c0774c98dce4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1222,6 +1222,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; From fc843bf42b3ab83b44112895ad77b43ea9249eb0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:03 +0800 Subject: [PATCH 010/804] f2fs: don't miss any f2fs_balance_fs cases In f2fs_map_blocks, let f2fs_balance_fs detects node page modification with dn.node_changed to avoid miss some corner cases. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 15c0fe40ed5c..01dc6ac79224 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -670,7 +670,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -729,10 +728,8 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -787,7 +784,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -806,9 +802,8 @@ skip: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -816,7 +811,7 @@ sync_out: unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, 
allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); From a24f28d74694db4639ad644246dcc330b0cef2c4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:04 +0800 Subject: [PATCH 011/804] f2fs: be aware of extent beyond EOF in fiemap f2fs can support fallocating blocks beyond file size without changing the size, but ->fiemap of f2fs was restricted and can't detect these extents fallocated past EOF, now relieve the restriction. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 01dc6ac79224..6e00e017bb4f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -880,7 +880,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -897,13 +896,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -922,13 +914,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. 
- */ + flags |= FIEMAP_EXTENT_LAST; } From 70aa0e6cb1a3be365613997c33bd468b01bce93e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:05 +0800 Subject: [PATCH 012/804] f2fs: fix to update largest extent under lock In order to avoid a race, make sure the largest extent cache is updated under lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..1fbebcb33a9d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -252,6 +252,7 @@ retry: int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +268,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); From a6c3b7211039846974b5b80e68032951a0999f86 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:06 +0800 Subject: [PATCH 013/804] f2fs: fix error handling in fsync_node_pages In fsync_node_pages, if f2fs was tagged with CP_ERROR_FLAG, make sure the bio cache is flushed before returning.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 97eb2c0811b5..bc38e5a92b4b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1338,7 +1338,8 @@ retry: if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1411,7 +1412,7 @@ continue_unlock: unlock_page(last_page); goto retry; } - +out: if (nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; From 5f3ec1f715c1c18b544e2480472c3a2cdf19a425 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 11 Oct 2016 10:36:12 -0700 Subject: [PATCH 014/804] f2fs: fix sparse warnings f2fs contained a number of endianness conversion bugs. Also, one function should have been 'static'. Found with sparse by running 'make C=2 CF=-D__CHECK_ENDIAN__ fs/f2fs/' Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/dir.c | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 5 +++-- fs/f2fs/node.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c0dba11519cf..7136dc1ade11 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b21a0788f2cd..06d20489d532 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -420,7 +420,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); 
fake_mode = get_de_type(de) << S_SHIFT; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc38e5a92b4b..d2ba37a84f8e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..cfdcf98516a1 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -313,7 +313,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* From a943c829bed9925e110fe5ea6891a10d7607e781 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:56:59 +0800 Subject: [PATCH 015/804] f2fs: clear nlink if fail to add_link We don't need to keep an incompletely created inode in cache, so if we fail to add a link into the directory during new inode creation, it's better to set nlink of the inode to zero, so that we can evict the inode immediately. Otherwise release of the nid belonging to the inode will be delayed until the inode cache is being shrunk, which may cause a seemingly endless loop while allocating free nids when testing the generic/269 case of the fstests suite.
Signed-off-by: Chao Yu [Jaegeuk Kim: add update_inode_page to fix kernel panic] Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1fbebcb33a9d..d32fd0343eae 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -387,6 +387,8 @@ retry: f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -427,6 +429,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); From 3499fdbee609d03aedc25ebd7baa420e343e4dbb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Oct 2016 19:28:29 +0800 Subject: [PATCH 016/804] f2fs: split free nid list During free nid allocation, in order to do preallocation, we will tag free nid entry as allocated one and still leave it in free nid list, for other allocators who want to grab free nids, it needs to traverse the free nid list for lookup. It becomes overhead in scenario of allocating free nid intensively by multithreads. This patch splits free nid list to two list: {free,alloc}_nid_list, to keep free nids and preallocated free nids separately, after that, traverse latency will be gone, besides split nid_cnt for separate statistic. Additionally, introduce __insert_nid_to_list and __remove_nid_from_list for cleanup. 
Signed-off-by: Chao Yu [Jaegeuk Kim: modify f2fs_bug_on to avoid needless branches] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 ++-- fs/f2fs/f2fs.h | 14 +++-- fs/f2fs/node.c | 136 +++++++++++++++++++++++++++------------------ fs/f2fs/node.h | 11 ++-- fs/f2fs/shrinker.c | 4 +- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fb245bd302e4..6af146c48644 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -74,7 +74,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +195,9 @@ get_cache: si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -324,8 +327,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 45d1e4522760..cec025852c22 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -529,6 +529,12 @@ static inline void 
__try_update_largest_extent(struct inode *inode, } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -548,9 +554,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -2214,7 +2220,7 @@ struct f2fs_stat_info { s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d2ba37a84f8e..5bb2fa324e68 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -1699,10 +1699,31 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, static void __del_from_free_nid_list(struct 
f2fs_nm_info *nm_i, struct free_nid *i) { - list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); } +static void __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; + list_del(&i->list); +} + static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1733,33 +1754,33 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, FREE_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + 
spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1798,7 +1819,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ @@ -1834,7 +1855,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1867,23 +1888,22 @@ retry: if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ build_free_nids(sbi); @@ -1898,11 +1918,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i 
|| i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1919,17 +1940,20 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); + if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; } else { i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST); } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1941,24 +1965,26 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2014,7 +2040,7 @@ recover_xnid: if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - 
remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2044,7 +2070,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2278,20 +2304,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2336,17 +2364,19 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + 
spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index cfdcf98516a1..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..ec539f407cc4 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -26,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; + if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) + return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; return 0; } From bae23863f810952bf54caf9ee56cd3b2763d22bf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 10:09:59 -0700 Subject: [PATCH 017/804] f2fs: clean up free nid list operations This patch cleans up to use consistent free nid list ops. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 56 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5bb2fa324e68..ef5357c7af24 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1696,25 +1696,26 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) -{ - radix_tree_delete(&nm_i->free_nid_root, i->nid); -} - -static void __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : i->state != NID_ALLOC); nm_i->nid_cnt[list]++; list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; } static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) + struct free_nid *i, enum nid_list list, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1722,6 +1723,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, i->state != NID_ALLOC); nm_i->nid_cnt[list]--; list_del(&i->list); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1729,6 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; + int err; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1755,15 +1759,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) } spin_lock(&nm_i->nid_list_lock); - if 
(radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - __insert_nid_to_list(sbi, i, FREE_NID_LIST); - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); return 1; } @@ -1776,8 +1778,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1897,9 +1898,9 @@ retry: struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1921,8 +1922,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -1944,14 +1944,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); i->state = NID_NEW; - 
__insert_nid_to_list(sbi, i, FREE_NID_LIST); + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } spin_unlock(&nm_i->nid_list_lock); @@ -1978,9 +1977,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); - + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2367,8 +2364,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From b1b14da24aab69fbb84159fe5c57035dafc50276 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:35 +0800 Subject: [PATCH 018/804] f2fs: don't interrupt free nids building during nid allocation Let build_free_nids support sync/async methods, in allocation flow of nids, we use synchronuous method, so that we can avoid looping in alloc_nid when free memory is low; in unblock_operations and f2fs_balance_fs_bg we use asynchronuous method in where low memory condition can interrupt us. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 22 ++++++++++------------ fs/f2fs/segment.c | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 63ca342a3cc8..1d273f51bc1c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -990,7 +990,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cec025852c22..9e8d3c9af54a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2071,7 +2071,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ef5357c7af24..5800a1082fe8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1734,9 +1734,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct nat_entry *ne; int err; - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; - /* 0 nid should not be used */ if (unlikely(nid == 0)) return 0; @@ -1804,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void __build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct 
f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1823,6 +1818,9 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; + if (!sync && !available_free_memory(sbi, FREE_NIDS)) + return; + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1865,10 +1863,10 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi); + __build_free_nids(sbi, sync); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1907,7 +1905,7 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi); + build_free_nids(sbi, true); goto retry; } @@ -2344,7 +2342,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 75477ec6c535..48903702de27 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -380,7 +380,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || From 518a2cf9a065fe11df9382dfb089c745c6ebca69 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:36 +0800 Subject: [PATCH 019/804] f2fs: avoid casted negative value as shrink count This patch makes sure it returns a positive value instead of a probable casted negative value as shrink count. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/shrinker.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index ec539f407cc4..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) - return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) From 2c58f7dea88d81f35e7e78fb6a3af41f9d759346 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:28:05 -0700 Subject: [PATCH 020/804] f2fs: count dirty inodes to flush node pages during checkpoint If there are a lot of dirty inodes, we need to flush all of them when doing checkpoint. So, we need to count this for enough free space. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..762743988426 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) From 4ce47914814383d34518cb22121b3216fbcefec1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:30:31 -0700 Subject: [PATCH 021/804] f2fs: call f2fs_balance_fs for setattr If inode becomes dirty, we need to check the # of dirty inodes whether or not further checkpoint would be required. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c0774c98dce4..53ba384cb675 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -695,7 +695,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -724,6 +723,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } f2fs_mark_inode_dirty_sync(inode); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } From cab4de5c485aa661a3019ca92bf617294652fb4e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2016 15:36:31 -0700 Subject: [PATCH 022/804] f2fs: declare static function for __build_free_nids This patch avoids build warning. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5800a1082fe8..e1ce0b8438fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1806,7 +1806,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); From 1d486e74cf8427152f96688d466a1c57a44a7642 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Oct 2016 11:07:45 -0700 Subject: [PATCH 023/804] f2fs: use BIO_MAX_PAGES for bio allocation We don't need to allocate bio partially in order to maximize sequential writes. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 +--- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 17 +++-------------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1d273f51bc1c..1dffe86651be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -226,7 +226,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6e00e017bb4f..465fa9d62485 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -272,10 +272,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e1ce0b8438fc..389be7f6e07c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2099,7 +2099,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2108,7 +2107,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 48903702de27..ec4d74c26067 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2239,10 +2239,10 @@ static void 
build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 762743988426..89ab4301ef02 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -102,8 +102,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -696,13 +694,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. 
@@ -720,7 +711,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -737,11 +728,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; From 3139e5f850cdbafc1b0b6f787e1473c5ea687603 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Nov 2016 18:04:05 -0800 Subject: [PATCH 024/804] f2fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps This is for backport only. fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 22 ++++++++++++++++++++++ fs/f2fs/file.c | 8 ++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/namei.c | 8 ++++---- fs/f2fs/xattr.c | 2 +- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7136dc1ade11..3b8ebec0450b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -312,7 +312,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -465,7 +465,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -683,7 +683,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) 
if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -730,7 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8d3c9af54a..1938fe457041 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,6 +138,28 @@ static inline void inode_nohighmem(struct inode *inode) mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } +/** + * current_time - Return FS time + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs. + * + * Note that inode and inode->i_sb cannot be NULL. + * Otherwise, the function warns and returns time without truncation.
+ */ +static inline struct timespec current_time(struct inode *inode) +{ + struct timespec now = current_kernel_time(); + + if (unlikely(!inode->i_sb)) { + WARN(1, "current_time() called with uninitialized super_block in the inode"); + return now; + } + + return timespec_trunc(now, inode->i_sb->s_time_gran); +} + /* * For checkpoint manager */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 53ba384cb675..04a3205d2934 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -632,7 +632,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -708,7 +708,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); } } @@ -1402,7 +1402,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1494,7 +1494,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 06d20489d532..3106155994b4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -573,7 +573,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0f071a70522d..ae29726afff0 100644 --- a/fs/f2fs/namei.c 
+++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); @@ -182,7 +182,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -720,7 +720,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -774,7 +774,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 69c6bb9cf207..3a42405b6515 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -588,7 +588,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && From 20339a1214b21266e44660c91fd9b391206c9d35 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 11:51:23 -0700 Subject: [PATCH 025/804] f2fs: keep dirty inodes selectively for checkpoint This is to avoid no free segment bug during checkpoint caused 
by a number of dirty inodes. The case was reported by Chao like this. 1. mount with lazytime option 2. fill 4k file until disk is full 3. sync filesystem 4. read all files in the image 5. umount In this case, we actually don't need to flush dirty inode to inode page during checkpoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 6 +++--- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 29 ++++++++++++++++------------- fs/f2fs/xattr.c | 4 ++-- 10 files changed, 49 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4a34040932e9..a45d1f4b7b0f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -387,7 +387,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3b8ebec0450b..5594667c2f41 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - 
f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1938fe457041..0d2502fdf892 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -541,13 +541,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1680,7 +1680,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1707,7 +1707,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1716,7 +1716,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1727,7 +1727,7 @@ static 
inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1741,7 +1741,7 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1756,19 +1756,19 @@ static inline bool f2fs_skip_inode_update(struct inode *inode) static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1896,13 +1896,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline int f2fs_readonly(struct super_block *sb) @@ -2054,7 +2054,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void 
f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 04a3205d2934..ce38a350fb38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -265,7 +265,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -633,7 +633,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -722,7 +722,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* update attributes only */ + f2fs_mark_inode_dirty_sync(inode, false); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1403,7 +1404,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3106155994b4..841aa13d9f4e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -574,7 +574,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d32fd0343eae..bfa512dde4ab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if 
(f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ae29726afff0..7f2fdb154180 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -775,7 +775,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -935,7 +935,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -950,7 +950,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 006138a6c5ab..23190a94840b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -620,24 +620,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if 
(is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +650,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +679,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3a42405b6515..1c4d5e39586c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -594,7 +594,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, 
F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: From 86f4d9f42e8a3bb2a02ca1eaecc180f1f1b21ee3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Oct 2016 18:27:56 -0700 Subject: [PATCH 026/804] f2fs: make clean inodes when flushing inode page This patch tries to make more clean inodes when flushing dirty inodes in checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/inode.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1dffe86651be..ed79757c36e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -924,7 +924,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bfa512dde4ab..7b5e402f0a72 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -339,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } From 1789a2ca8a3e2247076419891f39f4d076b3738f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Oct 2016 19:09:57 -0700 Subject: [PATCH 027/804] f2fs: remove percpu_count due to performance regression This patch removes percpu_count usage due to performance regression in iozone. 
Fixes: 523be8a6b3 ("f2fs: use percpu_counter for page counters") Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 ++++++------ fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/super.c | 16 +++++----------- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6af146c48644..2fdf23398fa1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -313,17 +313,17 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d2502fdf892..932c53f441db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -872,7 +872,7 @@ struct f2fs_sb_info { atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -1286,7 +1286,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void 
inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) return; @@ -1303,7 +1303,7 @@ static inline void inode_inc_dirty_pages(struct inode *inode) static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1319,7 +1319,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } static inline s64 get_dirty_pages(struct inode *inode) @@ -2239,8 +2239,8 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 23190a94840b..6034d51fc5fc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -696,10 +696,6 @@ static void f2fs_destroy_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } @@ -1450,6 +1446,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block 
*raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1474,6 +1471,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1489,13 +1489,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) From 93ae1e63e4757ca32d39dacc8488ade28fbae4d8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Nov 2016 14:52:15 +0100 Subject: [PATCH 028/804] f2fs: hide a maybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc is unsure about the use of last_ofs_in_node, which might happen without a prior initialization: fs/f2fs//git/arm-soc/fs/f2fs/data.c: In function ‘f2fs_map_blocks’: fs/f2fs/data.c:799:54: warning: ‘last_ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { As pointed out by Chao Yu, the code is actually correct as 'prealloc' is only set if the last_ofs_in_node has been set, the two always get updated together. This initializes last_ofs_in_node to dn.ofs_in_node for each new dnode at the start of the 'next_block' loop, which at that point is a correct initialization as well. I assume that compilers that correctly track the contents of the variables and do not warn about the condition also figure out that they can eliminate the extra assignment here. 
Fixes: 46008c6d4232 ("f2fs: support in batch multi blocks preallocation") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 465fa9d62485..192bc039194d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -707,7 +707,7 @@ next_dnode: } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: From e164e43eb20b2980dc0e7f740c8b3e0596e9778f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2016 10:44:44 -0800 Subject: [PATCH 029/804] fs/crypto: catch up 4.9-rc6 Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 26 ++++---- fs/crypto/fname.c | 132 ++++++++++++++++++--------------------- fs/crypto/keyinfo.c | 85 ++++++++++++++++--------- fs/crypto/policy.c | 4 ++ fs/f2fs/dir.c | 6 +- fs/f2fs/namei.c | 6 +- include/linux/fscrypto.h | 24 ------- 7 files changed, 141 insertions(+), 142 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..2d40ab9edc9f 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include #include #include -#include static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = 
req->data; @@ -152,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -170,19 +172,17 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d49113efa..9b774f4b50c8 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,21 +10,16 @@ * This has not yet undergone a rigorous security audit. 
*/ -#include -#include #include #include #include -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -35,90 +30,80 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { - u32 ciphertext_len; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; + struct scatterlist sg; int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim; + unsigned int lim; + unsigned int cryptlen; lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? 
lim : ciphertext_len; + /* + * Copy the filename to the output buffer for encrypting in-place and + * pad it with the needed number of NUL bytes. + */ + cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); + cryptlen = round_up(cryptlen, padding); + cryptlen = min(cryptlen, lim); + memcpy(oname->name, iname->name, iname->len); + memset(oname->name + iname->len, 0, cryptlen - iname->len); - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } + /* Initialize the IV */ + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - /* Allocate request */ + /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); + "%s: skcipher_request_alloc() failed\n", __func__); return -ENOMEM; } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); + sg_init_one(&sg, oname->name, cryptlen); + skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + /* Do the encryption */ res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { + /* Request is being completed asynchronously; wait for it */ wait_for_completion(&ecr.completion); res = ecr.res; } - kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", 
__func__, res); + return res; + } - oname->len = ciphertext_len; - return res; + oname->len = cryptlen; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -146,7 +131,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -168,7 +153,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -231,9 +216,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); @@ -279,6 +263,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +275,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +290,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +302,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +323,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +357,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263eddc4e..67fb6d8876d0 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
*/ -#include #include -#include #include -#include #include static void derive_crypt_complete(struct crypto_async_request *req, int rc) @@ -139,6 +136,38 @@ out: return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; - u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; + int keysize; + u8 *raw_key = NULL; int res; res = fscrypt_initialize(); @@ -179,13 +208,19 @@ retry: if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = 
kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +233,20 @@ retry: crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + + /* + * This cannot be a stack buffer because it is passed to the scatterlist + * crypto API as part of key derivation. + */ + res = -ENOMEM; + raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + if (!raw_key) + goto out; + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,11 +281,12 @@ got_key: crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); + raw_key = NULL; if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { put_crypt_info(crypt_info); goto retry; @@ -268,7 +297,7 @@ out: if (res == -ENOKEY) res = 0; put_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); return res; } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115acb5dee..6865663aac69 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct 
file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5594667c2f41..210082783d5a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,12 +814,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7f2fdb154180..468b2dbe6d34 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -451,7 +451,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1047,7 +1047,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1059,7 +1059,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); return *cookie = paddr; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index 76cff18bb032..ff8b11b26f31 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -111,23 +111,6 @@ struct fscrypt_completion_result { struct fscrypt_completion_result ecr = { \ COMPLETION_INITIALIZER((ecr).completion), 0 } -static 
inline int fscrypt_key_size(int mode) -{ - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - return FS_AES_256_XTS_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_GCM: - return FS_AES_256_GCM_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CBC: - return FS_AES_256_CBC_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CTS: - return FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - #define FS_FNAME_NUM_SCATTER_ENTRIES 4 #define FS_CRYPTO_BLOCK_SIZE 16 #define FS_FNAME_CRYPTO_DIGEST_SIZE 32 @@ -202,13 +185,6 @@ static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); } -static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) -{ - if (size == fscrypt_key_size(mode)) - return size; - return 0; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') From 16c3c372dca7de2199197fcf08a90f6c276011c2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Oct 2016 18:46:34 +0800 Subject: [PATCH 030/804] f2fs: report error of f2fs_fill_dentries Report error of f2fs_fill_dentries to ->iterate_shared, otherwise when error occurs, user may just list part of dirents in target directory without any hints.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 21 ++++++++++++--------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 6 ++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 210082783d5a..4436079dbf0c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -785,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -820,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -828,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -872,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -892,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: 
fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 932c53f441db..4b13d70d716c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2014,7 +2014,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 841aa13d9f4e..3f8bfc87c6dc 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -611,6 +611,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -623,11 +624,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, From 20cf9476e3b8d1b6ecadf7abf0970d90a393217d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 31 Oct 2016 14:01:41 -0700 Subject: [PATCH 031/804] f2fs: avoid infinite loop in the EIO case on recover_orphan_inodes This patch should fix an infinite loop case below. F2FS-fs : inject IO error in f2fs_read_end_io+0xf3/0x120 [f2fs] F2FS-fs (nvme0n1p1): recover_orphan_inode: orphan failed (ino=39ac1a), run fsck to fix. ... 
[] sync_meta_pages+0xae/0x270 [f2fs] [] ? flush_sit_entries+0x8d/0x960 [f2fs] [] write_checkpoint+0x361/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x320 [f2fs] [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xe0e/0x1300 [f2fs] [] ? sget_userns+0x4f4/0x530 [] mount_bdev+0x182/0x1b0 [] ? f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 [] ? copy_mount_options+0xb7/0x220 [] SyS_mount+0x94/0xd0 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6034d51fc5fc..e007c011ec53 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1890,6 +1890,13 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). 
+ */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: From 26fcd8659ef3863962235bfa1209b53a86fa9e06 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:44:59 +0900 Subject: [PATCH 032/804] f2fs: Add missing break in switch-case Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e007c011ec53..4fd34e7bcf60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -420,6 +420,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_nodiscard: clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; From 6e89bc832cc20e2edd05167dec11ca2a60b6d5d2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:00 +0900 Subject: [PATCH 033/804] f2fs: Use generic zoned block device terminology SMR stands for "Shingled Magnetic Recording" which makes sense only for hard disk drives (spinning rust). The ZBC/ZAC standards enable management of SMR disks, but solid state drives may also support those standards. So rename the HMSMR feature to BLKZONED to avoid a HDD centric terminology. For the same reason, rename f2fs_sb_mounted_hmsmr to f2fs_sb_mounted_blkzoned. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 192bc039194d..3817cf841dff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -110,7 +110,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, { if (!is_read_io(rw)) { atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b13d70d716c..5a563a9f3a52 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -2470,9 +2470,9 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4fd34e7bcf60..3574d0620dc4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -974,7 +974,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { From 5b0f4f4c6a017a563edb90f393dd01d0f4dc7d4c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 
Oct 2016 17:45:01 +0900 Subject: [PATCH 034/804] f2fs: Check zoned block feature for host-managed zoned block devices The F2FS_FEATURE_BLKZONED feature indicates that the drive was formatted with zone alignment optimization. This is optional for host-aware devices, but mandatory for host-managed zoned block devices. So check that the feature is set in this latter case. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3574d0620dc4..4187e3b9a83e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1639,6 +1639,26 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#else + if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + goto free_sb_buf; + } +#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); From 40e505d0271bc5af896b903ef4e1d6d0068feb27 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:02 +0900 Subject: [PATCH 035/804] f2fs: Suppress discard warning message for zoned block devices For zoned block devices, discard is replaced by zone reset. So do not warn if the device does not support discard.
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4187e3b9a83e..3e57ec837de9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,7 +412,7 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); From 789098ffddbfc8adbf55470d66ef6eef264485f0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:03 +0900 Subject: [PATCH 036/804] f2fs: Always enable discard for zoned blocks devices Zone write pointer reset acts as discard for zoned block devices. So if the zoned block device feature is enabled, always declare that discard is enabled, even if the device does not actually support the command. For the same reason, prevent the use of the "nodiscard" mount option.
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 +++++++------- fs/f2fs/super.c | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5a563a9f3a52..4fd31208965d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1159,13 +1159,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2475,6 +2468,13 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); +} + static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) { clear_opt(sbi, ADAPTIVE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3e57ec837de9..33676c1e35d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -419,6 +419,11 @@ static int parse_options(struct super_block *sb, char *options) } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); break; case Opt_noheap: From 02bccb06333fd6abe7b3bf61f092d6cffc9b3722 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:04 +0900 Subject: [PATCH 037/804] f2fs: Do not allow adaptive mode for host-managed zoned block devices The LFS mode is mandatory for host-managed zoned block devices as update in place optimizations are not possible for segments in sequential zones. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 33676c1e35d7..6bc0810969b7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -518,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { From 060887886644d506772e03d323782a6bffaedcb6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:05 +0900 Subject: [PATCH 038/804] f2fs: Cache zoned block devices zone type With the zoned block device feature enabled, section discard needs to do a zone reset for sections contained in sequential zones, and a regular discard (if supported) for sections stored in conventional zones. Avoid the need for a costly report zones to obtain a section zone type when discarding it by caching the types of the device zones in the super block information. This cache is initialized at mount time for mounts with the zoned block device feature enabled.
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 18 +++++++++++++ fs/f2fs/super.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4fd31208965d..c6dba704b0fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -803,6 +803,14 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ + u8 *blkz_type; /* Array of zones type */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -2468,6 +2476,16 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + + return sbi->blkz_type[zno]; +} +#endif + static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6bc0810969b7..d777a18df958 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1512,6 +1512,65 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->log_blocks_per_blkz = 
__ilog2_u32(sbi->blocks_per_blkz); + sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_size(bdev) - 1)) + sbi->nr_blkz++; + + sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); + if (!sbi->blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + sbi->blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1758,6 +1817,15 @@ try_onemore: init_ino_entry_info(sbi); +#ifdef CONFIG_BLK_DEV_ZONED + err = init_blkz_info(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + goto free_blkz; + } +#endif + /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -1936,6 +2004,10 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +#ifdef CONFIG_BLK_DEV_ZONED +free_blkz: + kfree(sbi->blkz_type); +#endif kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); From ac0357e5d5e91b10d66cd6b01fe10424bb5215b7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:06 +0900 Subject: [PATCH 039/804] f2fs: Reset sequential zones on zoned block devices When a zoned block device is mounted, discarding sections contained in sequential zones must reset the zone write pointer. For sections contained in conventional zones, the regular discard is used if the drive supports it. 
Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ec4d74c26067..8e4863bd36f5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -584,6 +585,45 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +#ifdef CONFIG_BLK_DEV_ZONED +static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t sector = SECTOR_FROM_BLOCK(blkstart); + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + struct block_device *bdev = sbi->sb->s_bdev; + + if (nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Unaligned discard attempted (sector %llu + %llu)", + (unsigned long long)sector, + (unsigned long long)nr_sects); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS, 0); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? 
*/ + return -EIO; + } +} +#endif + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { @@ -601,6 +641,11 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb)) + return f2fs_issue_discard_zone(sbi, blkstart, blklen); +#endif return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } From 55fac8071160fb3368531abedb3b72e7a7394004 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:07 +0900 Subject: [PATCH 040/804] f2fs: Trace reset zone events Similarly to the regular discard, trace zone reset events. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + include/trace/events/f2fs.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8e4863bd36f5..06b9d16a19f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -615,6 +615,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, GFP_NOFS, 0); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3a09bb4dc3b2..90d6ad49a9c5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1110,6 +1110,27 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +TRACE_EVENT(f2fs_issue_reset_zone, + + TP_PROTO(struct super_block *sb, block_t blkstart), + + TP_ARGS(sb, blkstart), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blkstart = blkstart; + ), + + TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", + show_dev(__entry), + (unsigned long 
long)__entry->blkstart) +); + TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct super_block *sb, unsigned int nobarrier, From d69efabf19970dcc335ea8265affd8eadafe70f3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Nov 2016 20:43:21 +0800 Subject: [PATCH 041/804] f2fs: record inode updating status correctly We should record updating status of inode only for living inode, for those unlinked inode it needs to clear its ino cache, otherwise after the ino was been reused, it will cause unneeded node page writing during ->fsync. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7b5e402f0a72..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -377,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -409,10 +412,12 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); From 16650422c86074d8f4f02ede10fb66901226da90 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Nov 2016 00:26:55 +0800 Subject: [PATCH 042/804] f2fs: fix wrong i_atime recovery Shouldn't update in-memory 
i_atime with on-disk i_mtime of inode when recovering inode. Shuoran found this bug which is hidden for a long time, honour is belong to him. Signed-off-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..d2ba4da08ec3 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,10 +180,10 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); From 4cd4b0465d2227a6ca686d72f81d3fd3b207e94c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:33:57 -0700 Subject: [PATCH 043/804] f2fs: assign segments correctly for direct_io Previously, we assigned CURSEG_WARM_DATA for direct_io, but if we have two or four logs, we do not use that type at all. Let's fix it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 06b9d16a19f6..4bdf1191a36f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1422,8 +1422,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg; bool direct_io = (type == CURSEG_DIRECT_IO); - type = direct_io ? 
CURSEG_WARM_DATA : type; - + if (direct_io) { + if (sbi->active_logs <= 4) + type = CURSEG_HOT_DATA; + else + type = CURSEG_WARM_DATA; + } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From 5f59a8f59bdfe9190a24caaec69fa33975df7ce0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:59:15 -0700 Subject: [PATCH 044/804] f2fs: remove checkpoint in f2fs_freeze The generic freeze_super() calls sync_filesystems() before f2fs_freeze(). So, basically we don't need to do checkpoint in f2fs_freeze(). But, in xfs/068, it triggers circular locking problem below due to gc_mutex for checkpoint. ====================================================== [ INFO: possible circular locking dependency detected ] 4.9.0-rc1+ #132 Tainted: G OE ------------------------------------------------------- 1. wait for __sb_start_write() by [] dump_stack+0x85/0xc2 [] print_circular_bug+0x1cf/0x230 [] __lock_acquire+0x19e0/0x1bc0 [] lock_acquire+0x11b/0x220 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] __sb_start_write+0x130/0x200 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] f2fs_drop_inode+0x9b/0x160 [f2fs] [] iput+0x171/0x2c0 [] f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] [] block_operations+0x84/0x110 [f2fs] [] write_checkpoint+0xe8/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? sched_clock+0x9/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] ? do_fsync+0x70/0x70 [] ? do_fsync+0x70/0x70 [] sync_fs_one_sb+0x20/0x30 [] iterate_supers+0xae/0x100 [] sys_sync+0x55/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 2. 
wait for sbi->gc_mutex by [] lock_acquire+0x11b/0x220 [] mutex_lock_nested+0x76/0x3f0 [] f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_freeze+0x1c/0x20 [f2fs] [] freeze_super+0xcf/0x190 [] do_vfs_ioctl+0x53c/0x6a0 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d777a18df958..7a0634b0bee8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -800,13 +800,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -2153,3 +2157,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + From 11895b32059553d1ee358980e28587af9cd5eea6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 5 Nov 2016 11:12:40 +0800 Subject: [PATCH 045/804] Revert "f2fs: do not recover from previous remained wrong dnodes" i_times of inode will be set with current system time which can be configured through 'date', so it's not safe to judge dnode block as garbage data or unchanged inode depend on i_times. Now, we have used enhanced 'cp_ver + cp' crc method to verify valid dnode block, so I expect recoverying invalid dnode is almost not possible. This reverts commit 807b1e1c8e08452948495b1a9985ab46d329e5c2. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d2ba4da08ec3..62523b217571 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -196,32 +196,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) From 908659afc0564703fae66eae4ffe23b352308ee3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 7 Nov 2016 21:22:31 +0800 Subject: [PATCH 046/804] f2fs: return directly if block has been removed from the victim If one block has been to written to a new place, just return in move data process. This patch check it again with holding page lock. 
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 72a0ca08f901..744031194934 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -544,7 +544,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -564,6 +565,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -643,7 +647,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -651,6 +656,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -792,9 +800,9 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); From 17aa419b53395bb52d0afb5863335d9260e1775c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:31:40 -0800 Subject: [PATCH 047/804] f2fs: revert segment allocation for direct IO Now we don't need to be too 
much careful about storage alignment for dio, since its speed becomes quite fast and we'd better avoid any misalignment first. Revert: 38aa0889b250 (f2fs: align direct_io'ed data to section) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +----- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 36 +++++++++--------------------------- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3817cf841dff..c37396b3212e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -582,7 +582,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -600,11 +599,8 @@ alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6dba704b0fe..4d4bfbaeb788 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -639,7 +639,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4bdf1191a36f..19ab2e63d8d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1267,25 +1267,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - 
locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1419,25 +1415,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - if (direct_io) { - if (sbi->active_logs <= 4) - type = CURSEG_HOT_DATA; - else - type = CURSEG_WARM_DATA; - } - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* From 09d9b573e5881fb588edb22e1a093ac4e485e1f4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:08:22 -0800 Subject: [PATCH 048/804] f2fs: allow dio read for LFS mode We can allow dio reads for LFS mode, while doing buffered writes for dio writes. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c37396b3212e..08a1c09adba7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1723,7 +1723,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); From 79d47107adb6f58c8555be43e6b18fccf944ae8d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Oct 2016 19:02:05 -0700 Subject: [PATCH 049/804] f2fs: support multiple devices This patch implements multiple devices support for f2fs. Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big volume under one f2fs instance. Internal block management is very simple, but we will modify block allocation and background GC policy to boost IO speed by exploiting them accoording to each device speed. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c fs/f2fs/segment.c --- fs/f2fs/data.c | 55 ++++++++++++++-- fs/f2fs/f2fs.h | 29 +++++++-- fs/f2fs/segment.c | 112 +++++++++++++++++++++++--------- fs/f2fs/super.c | 138 +++++++++++++++++++++++++++++++--------- include/linux/f2fs_fs.h | 10 ++- 5 files changed, 274 insertions(+), 70 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 08a1c09adba7..447dd624f6a4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -87,6 +87,46 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +/* + * Return true, if pre_bio's bdev is same as its target device. 
+ */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + /* * Low-level block read/write IO operations. */ @@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? 
NULL : sbi; @@ -268,7 +307,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - io->fio.rw != fio->rw)) + (io->fio.rw != fio->rw) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -955,7 +995,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -973,8 +1012,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1068,7 +1106,8 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? 
*/ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; @@ -1725,6 +1764,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; + if (F2FS_I_SB(inode)->s_ndevs) + return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d4bfbaeb788..04f6dddc6d91 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -756,6 +756,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -804,10 +818,8 @@ struct f2fs_sb_info { #endif #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_blkz; /* Total number of zones */ unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ - u8 *blkz_type; /* Array of zones type */ #endif /* for node-related operations */ @@ -924,6 +936,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2190,6 +2204,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void 
f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); @@ -2477,11 +2494,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, - block_t blkaddr) + struct block_device *bdev, block_t blkaddr) { unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; - return sbi->blkz_type[zno]; + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; } #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 19ab2e63d8d7..30d0e9a76c62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -401,6 +401,32 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -411,24 +437,18 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); - + ret = 
submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,14 +469,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -586,18 +603,24 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } #ifdef CONFIG_BLK_DEV_ZONED -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t sector = SECTOR_FROM_BLOCK(blkstart); sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - struct block_device *bdev = sbi->sb->s_bdev; + sector_t sector; + int devi = 0; - if (nr_sects != bdev_zone_size(bdev)) { + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, - "Unaligned discard attempted (sector %llu + %llu)", - (unsigned long long)sector, - (unsigned long long)nr_sects); + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); return -EIO; } @@ -606,7 +629,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, * use regular discard if the drive supports it. For sequential * zones, reset the zone write pointer. 
*/ - switch (get_blkz_type(sbi, blkstart)) { + switch (get_blkz_type(sbi, bdev, blkstart)) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) @@ -625,29 +648,60 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } #endif -static int f2fs_issue_discard(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t start = SECTOR_FROM_BLOCK(blkstart); sector_t len = SECTOR_FROM_BLOCK(blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb)) - return f2fs_issue_discard_zone(sbi, blkstart, blklen); -#endif - return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + + trace_f2fs_issue_discard(sbi->sb, blkstart, 
blklen); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a0634b0bee8..2d332a16de71 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -1517,9 +1532,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED -static int init_blkz_info(struct f2fs_sb_info *sbi) +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { - struct block_device *bdev = sbi->sb->s_bdev; + struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; sector_t sector = 0; struct blk_zone *zones; @@ -1530,15 +1545,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) if (!f2fs_sb_mounted_blkzoned(sbi->sb)) return 0; + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + return -EINVAL; sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; if (nr_sectors & (bdev_zone_size(bdev) - 1)) - 
sbi->nr_blkz++; + FDEV(devi).nr_blkz++; - sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); - if (!sbi->blkz_type) + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 @@ -1563,7 +1584,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) } for (i = 0; i < nr_zones; i++) { - sbi->blkz_type[n] = zones[i].type; + FDEV(devi).blkz_type[n] = zones[i].type; sector += zones[i].len; n++; } @@ -1667,6 +1688,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); 
+ return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1725,15 +1817,7 @@ try_onemore: "Zoned block device support is not enabled\n"); goto free_sb_buf; } -#else - if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sb)) { - f2fs_msg(sb, KERN_ERR, - "Zoned block device feature not enabled\n"); - goto free_sb_buf; - } #endif - default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1803,6 +1887,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1821,15 +1912,6 @@ try_onemore: init_ino_entry_info(sbi); -#ifdef CONFIG_BLK_DEV_ZONED - err = init_blkz_info(sbi); - if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS blkzone information"); - goto free_blkz; - } -#endif - /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -2008,10 +2090,8 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -#ifdef CONFIG_BLK_DEV_ZONED -free_blkz: - kfree(sbi->blkz_type); -#endif +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h 
index 422630b8e588..cea41a124a80 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -52,10 +52,17 @@ #define VERSION_LEN 256 #define MAX_VOLUME_NAME 512 +#define MAX_PATH_LEN 64 +#define MAX_DEVICES 8 /* * For superblock */ +struct f2fs_device { + __u8 path[MAX_PATH_LEN]; + __le32 total_segments; +} __packed; + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -94,7 +101,8 @@ struct f2fs_super_block { __le32 feature; /* defined features */ __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ - __u8 reserved[871]; /* valid reserved region */ + struct f2fs_device devs[MAX_DEVICES]; /* device list */ + __u8 reserved[327]; /* valid reserved region */ } __packed; /* From 0bcbcd3714e5765abedf29fc42cd328e16c6b438 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:31:56 -0800 Subject: [PATCH 050/804] f2fs: use err for f2fs_preallocate_blocks This patch has no functional change. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 26 +++++++++++++------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 35 +++++++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 447dd624f6a4..13da02435fc5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -652,11 +652,11 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -668,19 +668,19 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -857,19 +857,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } 
static int get_data_block(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04f6dddc6d91..8dc378d82f67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2212,7 +2212,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce38a350fb38..fbfcd809baec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1324,15 +1324,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1344,12 +1344,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1363,7 +1363,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file 
*file, int mode, @@ -2267,12 +2267,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); From 5f8b73185bd818f074f2351065e46fe46fe18784 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:46:40 -0800 Subject: [PATCH 051/804] f2fs: fix redundant block allocation In direct_IO path of f2fs_file_write_iter(), 1. f2fs_preallocate_blocks(F2FS_GET_BLOCK_PRE_DIO) -> allocate LBA X 2. f2fs_direct_IO() -> return 0; Then, f2fs_write_data_page() will allocate another LBA X+1. This makes EIO triggered by HM-SMR. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 13da02435fc5..dcc5f61ac187 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -652,6 +652,13 @@ alloc: return 0; } +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -671,7 +678,10 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) err = f2fs_convert_inline_inode(inode); if (err) return err; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? 
+ F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { err = f2fs_convert_inline_inode(inode); @@ -1760,11 +1770,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) - return 0; - if (F2FS_I_SB(inode)->s_ndevs) + if (__force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); From 3d89bca8b1feefa6b4a574207115de1458711c7f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 17:38:35 -0800 Subject: [PATCH 052/804] f2fs: avoid BG_GC in f2fs_balance_fs If many threads hit has_not_enough_free_secs() in f2fs_balance_fs() at the same time, all the threads would do FG_GC or BG_GC. In this critical path, we totally don't need to do BG_GC at all. Let's avoid that. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 7 +++++-- fs/f2fs/segment.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8dc378d82f67..687ab43a6cd8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2235,7 +2235,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fbfcd809baec..84f4572ae959 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1853,7 +1853,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; diff --git 
a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 744031194934..54d06c21af07 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -905,7 +905,7 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; @@ -946,6 +946,9 @@ gc_more: if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30d0e9a76c62..27e1b7c56e4c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -364,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } From 185a1b0664eef1c95a2ab55ffc274146333f921f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 18:20:10 -0800 Subject: [PATCH 053/804] f2fs: fix wrong written_valid_blocks counting Previously, written_valid_blocks was got by ckpt->valid_block_count. But if the last checkpoint has some NEW_ADDR due to power-cut, we can get wrong value. Fix it to get the number from actual written block count from sit entries. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 27e1b7c56e4c..58cae4a541a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2176,7 +2176,6 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2243,7 +2242,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2397,6 +2396,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ From 2ea2e28982f0264aae0d1c3e413eddfa7c8c149e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Nov 2016 10:41:20 +0800 Subject: [PATCH 054/804] f2fs: don't wait writeback for datas during checkpoint Normally, while committing a checkpoint, we wait for all pages to be written back no matter whether the page is data or metadata, so in a scenario where lots of data IO is being submitted along with metadata, we may suffer long latency waiting for writeback during checkpoint. Indeed, we only care about persistence of pages with metadata, not pages with data, as file system consistency is only related to metadata. So, in order to avoid encountering long latency in the above scenario, let's recognize and reference metadata in submitted IOs, and wait for writeback only for metadata. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +++++++++++++++++++++++++++++------ fs/f2fs/debug.c | 7 ++++--- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/file.c | 2 -- fs/f2fs/gc.c | 2 -- fs/f2fs/segment.c | 1 - 7 files changed, 40 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ed79757c36e0..889317e07122 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1005,7 +1005,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dcc5f61ac187..3994e0a1d9ff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,26 @@ #include "trace.h" #include +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; @@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio) set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) 
wake_up(&sbi->cp_wait); @@ -148,7 +171,6 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, struct bio *bio, enum page_type type) { if (!is_read_io(rw)) { - atomic_inc(&sbi->nr_wb_bios); if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); @@ -304,6 +326,11 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || @@ -317,8 +344,6 @@ alloc_new: io->fio = *fio; } - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -1330,7 +1355,6 @@ done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1733,7 +1757,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 2fdf23398fa1..67a04d8074bb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -313,8 +314,8 @@ static int stat_show(struct seq_file *s, 
void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 687ab43a6cd8..d6119ea3b86d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -702,6 +702,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -709,6 +710,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -888,7 +891,6 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1302,7 +1304,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2261,7 +2264,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, 
nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84f4572ae959..bab65f0a5bb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -95,8 +95,6 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 54d06c21af07..6390d45c1b68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -690,8 +690,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 58cae4a541a7..23e8892c4e60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -288,7 +288,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); From daa738ea01c5e1c24fd67dd043723cf7e80ab758 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 16 Nov 2016 17:26:24 +0800 Subject: [PATCH 055/804] f2fs: fix an infinite loop when flush nodes in cp Thread A Thread B - write_checkpoint - block_operations -blk_start_plug -sync_node_pages - f2fs_do_sync_file - fsync_node_pages - f2fs_wait_on_page_writeback Thread A wait for global F2FS_DIRTY_NODES decreased to zero, it start a plug list, some requests have been added to this list. Thread B lock one dirty node page, and wait this page write back. But this page has been in plug list of thread A with PG_writeback flag. Thread A keep on running and its plug list has no chance to finish, so it seems a deadlock between cp and fsync path. 
This patch adds a wait on page writeback before setting the node page dirty to avoid this problem. Signed-off-by: Yunlei He Signed-off-by: Pengyang Hou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 389be7f6e07c..59cc29e6b73c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1409,6 +1409,7 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; From 8351875692b06008a6a91c4e63110c8d70fd83fe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:11 +0800 Subject: [PATCH 056/804] f2fs: fix to account total free nid correctly Thread A Thread B Thread C - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid alloc last nid - f2fs_unlock_op - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid as node count still not be increased, we will loop in alloc_nid - f2fs_write_node_pages - f2fs_balance_fs_bg - f2fs_sync_fs - write_checkpoint - block_operations - f2fs_lock_all - f2fs_lock_op While creating a new inode, we do not allocate and account the nid atomically, so when there are almost no free nids left, we may hit an infinite loop as in the stack above. To avoid that, reuse nm_i::available_nids for accounting free nids, and make nid allocation and accounting atomic during node creation. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d6119ea3b86d..973ca74404de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -560,7 +560,7 @@ enum nid_list { struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59cc29e6b73c..edacbabb92cf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1885,11 +1885,13 @@ retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; - spin_lock(&nm_i->nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + /* We should not use stale free nids created by build_free_nids */ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); @@ -1900,6 +1902,7 @@ retry: __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1951,6 +1954,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i->state = NID_NEW; __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } + + nm_i->available_nids++; + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2150,6 +2156,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); 
node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2222,8 +2241,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2298,7 +2321,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID_LIST] = 0; nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; From d1e1a3a4c8158a03f4d94933de4d17ebaebe9e15 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:31 +0800 Subject: [PATCH 057/804] f2fs: fix fdatasync For below two cases, we can't guarantee data consistence: a) 1. xfs_io "pwrite 0 4195328" "fsync" 2. xfs_io "pwrite 4195328 1024" "fdatasync" 3. godown 4. umount & mount --> isize we updated before fdatasync won't be recovered b) 1. xfs_io "pwrite -S 0xcc 0 4202496" "fsync" 2. xfs_io "fpunch 4194304 4096" "fdatasync" 3. godown 4. 
umount & mount --> dnode we punched before fdatasync won't be recovered The reason is that normally fdatasync won't be aware of modification of metadata in file, e.g. isize changing, dnode updating, so in ->fsync we will skip flushing node pages for above cases, result in making fdatasynced file being lost during recovery. Currently we have introduced DIRTY_META global list in sbi for tracking dirty inode selectively, so in fdatasync we can choose to flush nodes depend on dirty state of current inode in the list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/file.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 973ca74404de..16bedd87022d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1763,8 +1763,17 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bab65f0a5bb5..7fd8e7cffe9b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -209,7 +209,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } From aca5463a208f230e27b13aa2d8ca6952afb90f4d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 15:09:48 -0800 
Subject: [PATCH 058/804] f2fs: do not recover i_size if it's valid If i_size is already valid during roll_forward recovery, we should not update it according to the block alignment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 62523b217571..687c176f0b56 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -425,7 +425,7 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (i_size_read(inode) <= (start << PAGE_SHIFT)) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* From 8df5d34aa0cd8ab9ed6d79f1d2d64c9ccbaf6a58 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 18:53:16 -0800 Subject: [PATCH 059/804] f2fs: fix wrong AUTO_RECOVER condition If i_size is not aligned to the f2fs's block size, we should not skip inode update during fsync. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16bedd87022d..fa66e5baa58a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1774,7 +1774,8 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + i_size_read(inode) & PAGE_MASK) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); } From 10a2e5e7a2d4a17f15df8b23002979c00d562d4d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 18 Nov 2016 22:21:13 +0800 Subject: [PATCH 060/804] f2fs: drop duplicate header timer.h Drop duplicate header timer.h from segment.c. 
Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23e8892c4e60..ba715d60c738 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "segment.h" From 11fce24cbf647730c551293ca061d86716c017dc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Nov 2016 22:27:41 +0800 Subject: [PATCH 061/804] f2fs: fix incorrect free inode count in ->statfs While calculating the maximum number of inodes that can still be created in the remaining space, we should also account for the space occupied by data/node blocks, since data and node blocks share the main area. So fix the wrong calculation in ->statfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2d332a16de71..a288456c17c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -852,7 +852,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From fd464b55a493c54ba660ac3967a4f50f0979da47 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Sun, 20 Nov 2016 19:57:23 +0100 Subject: [PATCH 062/804] f2fs: set ->owner for debugfs status file's file_operations The struct file_operations instance serving the f2fs/status debugfs file lacks an initialization of its ->owner. This means that although that file might have been opened, the f2fs module can still get removed. Any further operation on that opened file, releasing included, will cause accesses to unmapped memory. 
Indeed, Mike Marshall reported the following: BUG: unable to handle kernel paging request at ffffffffa0307430 IP: [] full_proxy_release+0x24/0x90 <...> Call Trace: [] __fput+0xdf/0x1d0 [] ____fput+0xe/0x10 [] task_work_run+0x8e/0xc0 [] do_exit+0x2ae/0xae0 [] ? __audit_syscall_entry+0xae/0x100 [] ? syscall_trace_enter+0x1ca/0x310 [] do_group_exit+0x44/0xc0 [] SyS_exit_group+0x14/0x20 [] do_syscall_64+0x61/0x150 [] entry_SYSCALL64_slow_path+0x25/0x25 <...> ---[ end trace f22ae883fa3ea6b8 ]--- Fixing recursive fault but reboot is needed! Fix this by initializing the f2fs/status file_operations' ->owner with THIS_MODULE. This will allow debugfs to grab a reference to the f2fs module upon any open on that file, thus preventing it from getting removed. Fixes: 902829aa0b72 ("f2fs: move proc files to debugfs") Reported-by: Mike Marshall Reported-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Nicolai Stange Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 67a04d8074bb..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -377,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, From b683e01e25446c1fbf67d1a895d5ee25d897c694 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Nov 2016 15:20:16 +0100 Subject: [PATCH 063/804] f2fs: fix 32-bit build The addition of multiple-device support broke CONFIG_BLK_DEV_ZONED on 32-bit machines because of a 64-bit division: fs/f2fs/f2fs.o: In function `__issue_discard_async': extent_cache.c:(.text.__issue_discard_async+0xd4): undefined reference to `__aeabi_uldivmod' Fortunately, bdev_zone_size() is guaranteed to return a power-of-two number, so we can replace the % operator with a cheaper bit mask. 
Fixes: 792b84b74b54 ("f2fs: support multiple devices") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba715d60c738..4f557e5e789d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -614,7 +614,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path: "", From 099d3df452efc107e3856a606d9fa29302edc13e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 28 Nov 2016 15:33:38 -0800 Subject: [PATCH 064/804] f2fs: do not activate auto_recovery for fallocated i_size If a file needs to keep its i_size by fallocate, we need to turn off auto recovery during roll-forward recovery. This will resolve the below scenario. 1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 4096" -c "fsync" 2. xfs_io -f /mnt/f2fs/file -c "falloc -k 4096 4096" -c "fsync" 3. md5sum /mnt/f2fs/file; 4. godown /mnt/f2fs/ 5. umount /mnt/f2fs/ 6. mount -t f2fs /dev/sdx /mnt/f2fs 7. 
md5sum /mnt/f2fs/file Reported-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 38 +++++++++++++++++++++----------------- fs/f2fs/file.c | 2 ++ fs/f2fs/recovery.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa66e5baa58a..4f1046be0d74 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -449,6 +449,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -461,6 +462,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -1763,23 +1766,6 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) -{ - if (dsync) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; - - spin_lock(&sbi->inode_lock[DIRTY_META]); - ret = list_empty(&F2FS_I(inode)->gdirty_list); - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return ret; - } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || - i_size_read(inode) & PAGE_MASK) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; @@ -1932,6 +1918,24 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool 
f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fd8e7cffe9b..57b6dbcbbd88 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1403,6 +1403,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 687c176f0b56..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -187,6 +187,8 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = ""; else @@ -425,7 +427,8 @@ retry_dn: continue; } - if (i_size_read(inode) <= (start << PAGE_SHIFT)) + if (!file_keep_isize(inode) && + (i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -478,8 +481,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? 
"keep" : "recover", + recovered, err); return err; } From 0c0f597086be6f2f335648529df3e725840d582a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Nov 2016 19:13:43 -0800 Subject: [PATCH 065/804] f2fs: return AOP_WRITEPAGE_ACTIVATE for writepage We should use AOP_WRITEPAGE_ACTIVATE when we bypass writing pages. Signed-off-by: Chao Yu Signed-off-by: Miao Xie Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3994e0a1d9ff..3da99574a59d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1375,6 +1375,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1470,6 +1472,15 @@ continue_unlock: ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; From 769b4ad829b5b4a3ee2924441106cdf349d59f02 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2016 15:11:32 -0800 Subject: [PATCH 066/804] Revert "f2fs: use percpu_counter for # of dirty pages in inode" This reverts commit 1beba1b3a953107c3ff5448ab4e4297db4619c76. The perpcu_counter doesn't provide atomicity in single core and consume more DRAM. That incurs fs_mark test failure due to ENOMEM. 
Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4f1046be0d74..c7eb2ff398ce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -479,7 +479,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -1316,7 +1316,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1332,7 +1332,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1342,9 +1342,9 @@ static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 57b6dbcbbd88..5c0500813efe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1537,7 +1537,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a288456c17c0..ce09191891f8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -571,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -703,7 +699,6 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } From a980f29780f397a57a0f53cc2ce7a85078cf7e5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 11:37:14 -0800 Subject: [PATCH 067/804] f2fs: call sync_fs when f2fs is idle The sync_fs in f2fs_balance_fs_bg must avoid interrupting 
current user requests. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4f557e5e789d..b95f07559d90 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -381,12 +381,15 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false); + if (!is_idle(sbi)) + return; + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; From 96e6c6084b7525043095ed4235b60482d8f36573 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 13:56:04 -0800 Subject: [PATCH 068/804] f2fs: detect wrong layout Previous mkfs.f2fs allows small partition inappropriately, so f2fs should detect that as well. Refer this in f2fs-tools. 
mkfs.f2fs: detect small partition by overprovision ratio and # of segments Reported-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89ab4301ef02..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce09191891f8..07f4ba444733 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1453,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1464,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 640bdae24f2744feb52dfdeadda32215c111709c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 17:25:32 -0800 Subject: [PATCH 069/804] f2fs: free meta pages if sanity check 
for ckpt is failed This fixes missing freeing meta pages in the error case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 889317e07122..640f28576e88 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -768,7 +768,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; if (cur_page == cp1) sbi->cur_cp_pack = 1; @@ -796,6 +796,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; From f96ce4c98613274ebbdc8cf527a8eb43e47ba4ba Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Dec 2016 16:23:32 -0800 Subject: [PATCH 070/804] f2fs: fix to access nullified flush_cmd_control pointer f2fs_sync_file() remount_ro - f2fs_readonly - destroy_flush_cmd_control - f2fs_issue_flush - no fcc pointer! So, this patch doesn't free fcc in this case, but just stop its kernel thread which sends flush commands. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 33 +++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7eb2ff398ce..3ef2d93ab936 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2150,7 +2150,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b95f07559d90..a288de069164 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -486,8 +486,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -498,6 +503,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -505,6 +515,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if 
(IS_ERR(fcc->f2fs_issue_flush)) { @@ -517,14 +528,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -2658,7 +2675,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07f4ba444733..e6d8d011786c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1103,8 +1103,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. 
*/ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; From 2cf125d417ade924903ef4c09b24a864513714c1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 9 Dec 2016 11:46:10 -0800 Subject: [PATCH 071/804] scripts/tags.sh: catch 4.9-rc6 Signed-off-by: Jaegeuk Kim --- scripts/tags.sh | 222 ++++++++++++++++++++++++------------------------ 1 file changed, 112 insertions(+), 110 deletions(-) diff --git a/scripts/tags.sh b/scripts/tags.sh index 262889046703..a2ff3388e5ea 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # Generate tags or cscope files # Usage tags.sh # @@ -134,11 +134,6 @@ all_kconfigs() find_other_sources 'Kconfig*' } -all_defconfigs() -{ - find_sources $ALLSOURCE_ARCHS "defconfig" -} - docscope() { (echo \-k; echo \-q; all_target_sources) > cscope.files @@ -150,8 +145,111 @@ dogtags() all_target_sources | gtags -i -f - } +# Basic regular expressions with an optional /kind-spec/ for ctags and +# the following limitations: +# - No regex modifiers +# - Use \{0,1\} instead of \?, because etags expects an unescaped ? 
+# - \s is not working with etags, use a space or [ \t] +# - \w works, but does not match underscores in etags +# - etags regular expressions have to match at the start of a line; +# a ^[^#] is prepended by setup_regex unless an anchor is already present +regex_asm=( + '/^\(ENTRY\|_GLOBAL\)(\([[:alnum:]_\\]*\)).*/\2/' +) +regex_c=( + '/^SYSCALL_DEFINE[0-9](\([[:alnum:]_]*\).*/sys_\1/' + '/^COMPAT_SYSCALL_DEFINE[0-9](\([[:alnum:]_]*\).*/compat_sys_\1/' + '/^TRACE_EVENT(\([[:alnum:]_]*\).*/trace_\1/' + '/^TRACE_EVENT(\([[:alnum:]_]*\).*/trace_\1_rcuidle/' + '/^DEFINE_EVENT([^,)]*, *\([[:alnum:]_]*\).*/trace_\1/' + '/^DEFINE_EVENT([^,)]*, *\([[:alnum:]_]*\).*/trace_\1_rcuidle/' + '/^DEFINE_INSN_CACHE_OPS(\([[:alnum:]_]*\).*/get_\1_slot/' + '/^DEFINE_INSN_CACHE_OPS(\([[:alnum:]_]*\).*/free_\1_slot/' + '/^PAGEFLAG(\([[:alnum:]_]*\).*/Page\1/' + '/^PAGEFLAG(\([[:alnum:]_]*\).*/SetPage\1/' + '/^PAGEFLAG(\([[:alnum:]_]*\).*/ClearPage\1/' + '/^TESTSETFLAG(\([[:alnum:]_]*\).*/TestSetPage\1/' + '/^TESTPAGEFLAG(\([[:alnum:]_]*\).*/Page\1/' + '/^SETPAGEFLAG(\([[:alnum:]_]*\).*/SetPage\1/' + '/\<__SETPAGEFLAG(\([[:alnum:]_]*\).*/__SetPage\1/' + '/\ Date: Mon, 26 Sep 2016 18:07:48 +0200 Subject: [PATCH 072/804] fs/super.c: fix race between freeze_super() and thaw_super() Change thaw_super() to check frozen != SB_FREEZE_COMPLETE rather than frozen == SB_UNFROZEN, otherwise it can race with freeze_super() which drops sb->s_umount after SB_FREEZE_WRITE to preserve the lock ordering. In this case thaw_super() will wrongly call s_op->unfreeze_fs() before it was actually frozen, and call sb_freeze_unlock() which leads to the unbalanced percpu_up_write(). Unfortunately lockdep can't detect this, so this triggers misc BUG_ON()'s in kernel/rcu/sync.c. 
Reported-and-tested-by: Nikolay Borisov Signed-off-by: Oleg Nesterov Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/super.c b/fs/super.c index f5f4b328f860..d4d2591b77c8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1326,8 +1326,8 @@ int freeze_super(struct super_block *sb) } } /* - * This is just for debugging purposes so that fs can warn if it - * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + * For debugging purposes so that fs can warn if it sees write activity + * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); @@ -1346,7 +1346,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_writers.frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; } From a1c31d8ded433de32cf7d931ada7ec7cebe8ba85 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 11 Dec 2016 15:35:15 +0800 Subject: [PATCH 073/804] f2fs: fix a missing size change in f2fs_setattr This patch fixes a missing size change in f2fs_setattr Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c0500813efe..5808d5c709a7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -678,6 +678,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = inode_change_ok(inode, attr); if (err) @@ -708,6 +709,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -720,8 +723,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - /* update attributes only
*/ - f2fs_mark_inode_dirty_sync(inode, false); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From d8a1d0c13cde35521bd92c1c4607cbd9d3a7618f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 09:55:38 -0800 Subject: [PATCH 074/804] f2fs: remove wrong backported codes Kconfig and dentry RCU mode stuffs. Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 1 - fs/f2fs/namei.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1852d99df97b..378c221d68a9 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,7 +2,6 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK select CRYPTO - select KEYS select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 468b2dbe6d34..523bf073642e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1014,9 +1014,6 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook u32 max_size = inode->i_sb->s_blocksize; int res; - if (!dentry) - return ERR_PTR(-ECHILD); - res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); From 7146292938e42afacadd7b3402f459e638f5b77a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 18:24:54 -0800 Subject: [PATCH 075/804] f2fs: resolve op and op_flags conflicts Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 +++++--- fs/f2fs/data.c | 54 ++++++++++++++++++++----------------- fs/f2fs/f2fs.h | 24 +++++++++++++++-- fs/f2fs/gc.c | 12 ++++++--- fs/f2fs/inline.c | 3 ++- fs/f2fs/node.c | 12 +++++---- fs/f2fs/segment.c | 9 ++++--- fs/f2fs/trace.c | 7 ++--- include/trace/events/f2fs.h | 19 ++++++++----- 9 files changed, 98 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 640f28576e88..2ed785e5ffbb 100644
--- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -64,14 +64,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -158,13 +159,15 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op = REQ_OP_READ, + .op_flags = sync ? (REQ_SYNC | REQ_META | REQ_PRIO) : + REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; if (unlikely(type == META_POR)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3da99574a59d..87a85ff3c069 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -167,15 +167,15 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } -static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, - struct bio *bio, enum page_type type) +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) { - if (!is_read_io(rw)) { + if (!is_read_io(bio_op(bio))) { if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } - submit_bio(rw, bio); + submit_bio(0, bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -185,12 +185,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->rw)) + if (is_read_io(fio->op)) trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); else 
trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - __submit_bio(io->sbi, fio->rw, io->bio, fio->type); + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } @@ -256,10 +258,10 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - if (test_opt(sbi, NOBARRIER)) - io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + io->fio.op = REQ_OP_WRITE; + io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -301,14 +303,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, fio->rw, bio, fio->type); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -317,7 +320,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->rw); + bool is_read = is_read_io(fio->op); struct page *bio_page; io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; @@ -334,7 +337,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.rw != fio->rw) || + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: @@ -462,7 +465,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *get_read_data_page(struct inode *inode, pgoff_t index, - int rw, bool for_write) + int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -472,7 +475,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .encrypted_page = NULL, }; @@ -540,7 +544,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, REQ_SYNC, false); if (IS_ERR(page)) return page; @@ -566,7 +570,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, REQ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -1144,7 +1148,7 @@ got_it: if (bio && (last_block_in_bio != block_nr - 1 || !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -1153,6 +1157,7 @@ submit_and_realloc: bio = NULL; goto set_error_page; } + bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1167,7 +1172,7 @@ set_error_page: goto 
next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1177,7 +1182,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1295,7 +1300,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1717,14 +1723,14 @@ repeat: err = PTR_ERR(bio); goto fail; } - + bio->bi_rw = READ_SYNC; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; goto fail; } - __submit_bio(sbi, READ_SYNC, bio, DATA); + __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ef2d93ab936..d0c7decdd3ac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -112,6 +113,24 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return REQ_SYNC; + return 0; +} + /** * wq_has_sleeper - check if there are any waiting processes * @wq: wait queue head @@ -746,14 +765,15 @@ enum page_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains 
DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + int op; /* contains REQ_OP_ */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; -#define is_read_io(rw) (((rw) & 1) == READ) +#define is_read_io(rw) (rw == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6390d45c1b68..d3a36e4b442c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -627,7 +628,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - fio.rw = WRITE_SYNC; + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -668,7 +670,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE, .page = page, .encrypted_page = NULL, }; @@ -767,7 +770,8 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3f8bfc87c6dc..d82e97b1e6c4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -110,7 +110,8 @@ int f2fs_convert_inline_page(struct 
dnode_of_data *dn, struct page *page) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index edacbabb92cf..26a745c544fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1068,14 +1068,15 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int rw) +static int read_node_page(struct page *page, int op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .page = page, .encrypted_page = NULL, }; @@ -1116,7 +1117,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } @@ -1134,7 +1135,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, REQ_SYNC); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1575,7 +1576,8 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a288de069164..70aec4a8de13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -407,6 +408,7 @@ static int __submit_flush_wait(struct block_device *bdev) struct bio *bio = f2fs_bio_alloc(0); int ret; + bio->bi_rw = REQ_OP_WRITE; bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); @@ -1544,7 +1546,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -1552,7 +1555,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_mbio(&fio); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 562ce0821559..73b4e1d1912a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -25,11 +25,11 @@ static inline void __print_last_io(void) if (!last_io.len) return; - trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, + last_io.fio.op, last_io.fio.op_flags, last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); @@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) if (last_io.major == major 
&& last_io.minor == minor && last_io.pid == pid && last_io.type == __file_type(inode, pid) && - last_io.fio.rw == fio->rw && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && last_io.fio.new_blkaddr + last_io.len == fio->new_blkaddr) { last_io.len++; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 90d6ad49a9c5..7ad46e8a89e6 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); #define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) -#define show_bio_type(type) show_bio_base(type), show_bio_extra(type) +#define show_bio_type(op, op_flags) \ + show_bio_base((op|op_flags)), show_bio_extra((op|op_flags)) #define show_bio_base(type) \ __print_symbolic(F2FS_BIO_MASK(type), \ @@ -734,7 +735,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(pgoff_t, index) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) ), @@ -744,7 +746,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->index = page->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; ), @@ -754,7 +757,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type)) ); @@ -785,7 +788,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) __field(sector_t, sector) __field(unsigned int, size) @@ -793,7 +797,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = 
sb->s_dev; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; @@ -801,7 +806,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", show_dev(__entry), - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) From 373bb0247ae5d5ff0e371d613599fa44392e972e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 16:41:25 -0800 Subject: [PATCH 076/804] f2fs: support async discard based on v4.9 This patch is based on commit 275b66b09e85 (f2fs: support async discard). Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 183 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 181 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..d485bea3d6bb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,6 +1255,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1273,10 +1274,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) + if (err) { release_discard_addrs(sbi); - else + } else { clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0c7decdd3ac..883d3ab388c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -127,7 +127,7 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, static 
inline int wbc_to_write_flags(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) - return REQ_SYNC; + return REQ_SYNC | REQ_NOIDLE; return 0; } @@ -2174,6 +2174,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 70aec4a8de13..13bea6e5120e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,6 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *bio_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -622,6 +623,162 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, + struct bio *bio) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + + INIT_LIST_HEAD(&be->list); + be->bio = bio; + init_completion(&be->event); + list_add_tail(&be->list, wait_list); + + return be; +} + +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be, *tmp; + + list_for_each_entry_safe(be, tmp, wait_list, list) { + struct bio *bio = be->bio; + int err; + + wait_for_completion_io(&be->event); + err = be->error; + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + + bio_put(bio); + 
list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); + } +} + +static void f2fs_submit_bio_wait_endio(struct bio *bio) +{ + struct bio_entry *be = (struct bio_entry *)bio->bi_private; + + be->error = bio->bi_error; + complete(&be->event); +} + +/* copied from block/blk-lib.c in 4.10-rc1 */ +static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + unsigned int granularity; + int op = REQ_WRITE | REQ_DISCARD; + int alignment; + sector_t bs_mask; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secdiscard(q)) + return -EOPNOTSUPP; + op |= REQ_SECURE; + } + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + while (nr_sects) { + unsigned int req_sects; + sector_t end_sect, tmp; + + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); + + /** + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. 
+ */ + end_sect = sector + req_sects; + tmp = end_sect; + if (req_sects < nr_sects && + sector_div(tmp, granularity) != alignment) { + end_sect = end_sect - alignment; + sector_div(end_sect, granularity); + end_sect = end_sect * granularity + alignment; + req_sects = end_sect - sector; + } + + if (bio) { + int ret = submit_bio_wait(0, bio); + bio_put(bio); + if (ret) + return ret; + } + bio = f2fs_bio_alloc(0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, op, 0); + + bio->bi_iter.bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); + } + + *biop = bio; + return 0; +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + struct bio *bio = NULL; + int err; + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); + if (!err && bio) { + struct bio_entry *be = __add_bio_entry(sbi, bio); + + bio->bi_private = be; + bio->bi_end_io = f2fs_submit_bio_wait_endio; + submit_bio(REQ_SYNC, bio); + } + + return err; +} + #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -655,8 +812,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return blkdev_issue_discard(bdev, sector, nr_sects, - GFP_NOFS, 0); + return 
__f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: trace_f2fs_issue_reset_zone(sbi->sb, blkstart); @@ -672,15 +828,12 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); - #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_mounted_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -720,8 +873,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, if (len) err = __issue_discard_async(sbi, bdev, start, len); - - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return err; } @@ -822,11 +973,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); + blk_start_plug(&plug); + mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -875,6 +1029,8 @@ skip: SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } + + blk_finish_plug(&plug); } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -2551,6 +2707,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); + INIT_LIST_HEAD(&sm_info->wait_list); sm_info->nr_discards 
= 0; sm_info->max_discards = 0; @@ -2694,10 +2851,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; + bio_entry_slab = f2fs_kmem_cache_create("bio_entry", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + goto destroy_discard_entry; + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_discard_entry; + goto destroy_bio_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2707,6 +2869,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); +destroy_bio_entry: + kmem_cache_destroy(bio_entry_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2716,6 +2880,7 @@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(bio_entry_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From 0e4e431a23c324ed49916871ef1306a8c08f08c1 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 13 Dec 2016 17:23:37 +0800 Subject: [PATCH 077/804] f2fs: remove unused values in recover_fsync_data This patch remove unused values in function recover_fsync_data Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 981a9584b62f..4fb4471a3206 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -552,10 +552,8 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; struct list_head dir_list; - block_t blkaddr; int err; int ret = 0; bool need_writecp = false; @@ -571,8 +569,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); - blkaddr = 
NEXT_FREE_BLKADDR(sbi, curseg); - /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err || list_empty(&inode_list)) From 8799db31b9b1969792f05a48454234febed10008 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Dec 2016 18:54:59 +0800 Subject: [PATCH 078/804] f2fs: don't cache nat entry if out of memory If we run out of memory, in cache_nat_entry, it's better to avoid loop for allocating memory to cache nat entry, so in low memory scenario, for read path of node block, I expect this can avoid unneeded latency. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 26a745c544fc..b01b01cfc39e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -245,12 +245,24 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + bool no_fail) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + if (no_fail) { + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + } else { + new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + if (!new) + return NULL; + if (radix_tree_insert(&nm_i->nat_root, nid, new)) { + kmem_cache_free(nat_entry_slab, new); + return NULL; + } + } + memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -267,8 +279,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = __lookup_nat_cache(nm_i, nid); if (!e) { - e = grab_nat_entry(nm_i, nid); - node_info_from_raw_nat(&e->ni, ne); + e = grab_nat_entry(nm_i, nid, false); + if (e) + node_info_from_raw_nat(&e->ni, ne); } else { f2fs_bug_on(sbi, 
nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -286,7 +299,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); + e = grab_nat_entry(nm_i, ni->nid, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -2155,7 +2168,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid); + ne = grab_nat_entry(nm_i, nid, true); node_info_from_raw_nat(&ne->ni, &raw_ne); } From e891bf97aa8ab43ff254acede061157c1dae0ba8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 16 Dec 2016 11:18:15 +0300 Subject: [PATCH 079/804] f2fs: remove unneeded condition We checked that "inode" is not an error pointer earlier so there is no need to check again here. Signed-off-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 523bf073642e..ca9e2f85eae8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -321,9 +321,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto err_out; } - if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && - (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !fscrypt_has_permitted_context(dir, inode)) { + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { bool nokey = f2fs_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); err = nokey ? 
-ENOKEY : -EPERM; From e82207d3ee89d81b19882106b69cd5b760f3d4aa Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 19 Dec 2016 20:10:48 +0800 Subject: [PATCH 080/804] f2fs: fix a problem of using memory after free This patch fixes a use-after-free problem in function __try_merge_extent_node. Fixes: 0f825ee6e873 ("f2fs: add new interfaces for extent tree") Cc: Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 4db44da7ef69..e02c3d88dc9a 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -352,11 +352,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } From 4212c0f71a584ad1a8f4655c19032816f65b5d3e Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 20 Dec 2016 11:11:35 +0800 Subject: [PATCH 081/804] f2fs: add a case of no need to read a page in write begin If the range we write covers the whole valid data in the last page, we do not need to read it.
Signed-off-by: Yunlei He [Jaegeuk Kim: nullify the remaining area (fix: xfstests/f2fs/001)] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87a85ff3c069..b47830db4263 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1712,6 +1712,11 @@ repeat: if (len == PAGE_SIZE || PageUptodate(page)) return 0; + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); @@ -1765,7 +1770,7 @@ static int f2fs_write_end(struct file *file, * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { - if (unlikely(copied != PAGE_SIZE)) + if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); From 670a455ef9e10064eb35579e2b6c9c7492ab693f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 20 Dec 2016 21:57:42 +0800 Subject: [PATCH 082/804] f2fs: use rb_entry_safe Use rb_entry_safe() instead of open-coding it. Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e02c3d88dc9a..6ed6424807b6 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -311,28 +311,24 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, tmp_node = parent; if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_ex = tmp_node ? 
- rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&en->rb_node); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&en->rb_node); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } return en; } @@ -493,9 +489,8 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!next_en) { struct rb_node *node = rb_next(&en->rb_node); - next_en = node ? - rb_entry(node, struct extent_node, rb_node) - : NULL; + next_en = rb_entry_safe(node, struct extent_node, + rb_node); } if (parts) From 0c61b0a37be7e2f7f55066dfa77c3697765724ff Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 22 Dec 2016 11:46:24 +0800 Subject: [PATCH 083/804] f2fs: fix a missing discard prefree segments If userspace issues an fstrim with a range that does not involve prefree segments, those segments will be reused without being discarded. This patch fixes it.
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 13bea6e5120e..f4e41f997ae3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,9 +996,13 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (force || !test_opt(sbi, DISCARD)) + if (!test_opt(sbi, DISCARD)) continue; + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); @@ -2343,8 +2347,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason == CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc); + + cpc->trim_start = trim_start; } mutex_unlock(&sit_i->sentry_lock); From c1e5d5278024fbe65f123c6f8e772b82f1f72106 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 11:51:32 -0800 Subject: [PATCH 084/804] f2fs: reassign new segment for mode=lfs Otherwise we can remain wrong curseg->next_blkoff, resulting in fsck failure. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f4e41f997ae3..e6d3f3d4b028 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1508,9 +1508,6 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; - if (test_opt(sbi, LFS)) - return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; From 849981c99bd8508cfcce20b693ff3206e0b3d161 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 12:13:03 -0800 Subject: [PATCH 085/804] f2fs: add submit_bio tracepoint This patch adds final submit_bio() tracepoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 14 +++++++----- include/trace/events/f2fs.h | 45 ++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b47830db4263..ab82b388c5aa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -175,6 +175,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); submit_bio(0, bio); } @@ -185,13 +189,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) - trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); - else - trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h 
index 7ad46e8a89e6..217691582dd4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -779,12 +779,11 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, TP_CONDITION(page->mapping) ); -DECLARE_EVENT_CLASS(f2fs__submit_bio, +DECLARE_EVENT_CLASS(f2fs__bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) @@ -797,9 +796,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->op = fio->op; - __entry->op_flags = fio->op_flags; - __entry->type = fio->type; + __entry->op = bio_op(bio); + __entry->op_flags = bio->bi_rw; + __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), @@ -812,22 +811,38 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, __entry->size) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), 
TP_CONDITION(bio) ); From 22f1947949fd050ca103d37f369dbd6d2024ea50 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Dec 2016 10:12:56 -0800 Subject: [PATCH 086/804] f2fs: support IO alignment for DATA and NODE writes This patch implements IO alignment by filling dummy blocks in DATA and NODE write bios. If we can guarantee, for example, 32KB or 64KB for such the IOs, we can eliminate underlying dummy page problem which FTL conducts in order to close MLC or TLC partial written pages. Note that, - it requires "-o mode=lfs". - IO size should be power of 2, not exceed BIO_MAX_PAGES, 256. - read IO is still 4KB. - do checkpoint at fsync, if dummy NODE page was written. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 +++++++++++++++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 4 ++- fs/f2fs/segment.c | 9 +++++-- fs/f2fs/segment.h | 3 +++ fs/f2fs/super.c | 13 +++++++++- include/linux/f2fs_fs.h | 6 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ab82b388c5aa..1b19b805ef81 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -93,6 +93,17 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_error)) + f2fs_stop_checkpoint(sbi, true); + continue; + } + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { @@ -171,10 +182,42 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { + unsigned int start; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); + + if (type != DATA && type != NODE) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= 
F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } +submit_io: if (is_read_io(bio_op(bio))) trace_f2fs_submit_read_bio(sbi->sb, type, bio); else @@ -319,13 +362,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -void f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_mbio(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->op); struct page *bio_page; + int err = 0; io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; @@ -346,6 +390,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + err = -EAGAIN; + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + goto out_fail; + } io->bio = __bio_alloc(sbi, fio->new_blkaddr, BIO_MAX_PAGES, is_read); io->fio = *fio; @@ -359,9 +409,10 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); - +out_fail: up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); + return err; } static void __set_data_blkaddr(struct dnode_of_data *dn) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 883d3ab388c1..f9a739ffca0f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -859,6 +859,8 @@ struct f2fs_sb_info { struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + int write_io_size_bits; /* Write IO size bits */ + mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -2241,7 +2243,7 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, struct page *, nid_t, enum page_type, int); void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_io_info *); +int f2fs_submit_page_mbio(struct f2fs_io_info *); struct block_device *f2fs_target_device(struct f2fs_sb_info *, block_t, struct bio *); int f2fs_target_device_index(struct f2fs_sb_info *, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e6d3f3d4b028..a7bb97826445 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1684,15 +1684,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static void do_write_page(struct f2fs_summary *sum, 
struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + int err; if (fio->type == NODE || fio->type == DATA) mutex_lock(&fio->sbi->wio_mutex[fio->type]); - +reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_mbio(fio); + if (err == -EAGAIN) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } if (fio->type == NODE || fio->type == DATA) mutex_unlock(&fio->sbi->wio_mutex[fio->type]); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9d44ce83acb2..08f1455c812c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -186,9 +186,12 @@ struct segment_allocation { * the page is atomically written, and it is in inmem_pages list. */ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) struct inmem_pages { struct list_head list; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6d8d011786c..fb9f6c09fa11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1764,6 +1764,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -1881,12 +1883,19 @@ try_onemore: if (err) goto free_options; + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); + if (!sbi->write_io_dummy) + goto free_options; + } + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_options; + goto 
free_io_dummy; } err = get_valid_checkpoint(sbi); @@ -2104,6 +2113,8 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); free_options: destroy_percpu_info(sbi); kfree(options); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index cea41a124a80..f0748524ca8c 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,12 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */ +#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) + /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) #define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM) From 168fef245e107466a47431627a66414460822faa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 17:09:19 -0800 Subject: [PATCH 087/804] f2fs: get io size bit from mount option This patch adds to set io_size_bits from mount option. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ fs/f2fs/super.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 753dd4f96afe..d99faced79cb 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,8 @@ data_flush Enable data flushing before checkpoint in order to mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. +io_bits=%u Set the bit size of write IO requests. 
It should be set + with "mode=lfs". ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb9f6c09fa11..3b169927408e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -101,6 +101,7 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_mode, + Opt_io_size_bits, Opt_fault_injection, Opt_lazytime, Opt_nolazytime, @@ -133,6 +134,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, @@ -535,6 +537,17 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + sbi->write_io_size_bits = arg; + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; @@ -558,6 +571,13 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } return 0; } @@ -918,6 +938,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); return 0; } From 75e402e690b9f1713458f3adbef306a73de74f3c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 13:55:09 -0800 Subject: [PATCH 088/804] f2fs: show the max number of atomic operations This patch adds to show the max number 
of atomic operations which are conducting concurrently. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 7 +++++++ fs/f2fs/f2fs.h | 17 +++++++++++++++++ fs/f2fs/file.c | 8 ++++++-- fs/f2fs/segment.c | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fbd5184140d0..29cdf0c1da1d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,6 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -256,6 +258,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); + seq_printf(s, " - Atomic write count: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -414,6 +418,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f9a739ffca0f..19e054b1c4f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -951,6 +951,8 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -2303,6 +2305,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2374,6 +2377,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) 
#define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2427,6 +2441,9 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) #define stat_dec_inline_dir(inode) +#define stat_inc_atomic_write(inode) +#define stat_dec_atomic_write(inode) +#define stat_update_max_atomic_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5808d5c709a7..d7eacef08797 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1546,6 +1546,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) clear_inode_flag(inode, FI_ATOMIC_FILE); out: + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1575,9 +1577,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + stat_dec_atomic_write(inode); + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7bb97826445..353ec85b3835 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -243,6 +243,7 @@ void drop_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); From 3c299af84525fc49d1ea46bf4f420df132a31d3f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 17:31:15 -0800 Subject: [PATCH 089/804] f2fs: don't allow encrypted operations without keys This patch fixes the renaming bug on encrypted filenames, which was pointed by 
(ext4: don't allow encrypted operations without keys) Cc: Theodore Ts'o Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ca9e2f85eae8..db3079cd665d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -660,6 +660,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -840,6 +846,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || From 7e43f19b5ecdff3ff4ac97149446ad7c035b2185 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 3 Jan 2017 17:19:30 -0800 Subject: [PATCH 090/804] f2fs: drop exist_data for inline_data when truncated to 0 A test program gets the SEEK_DATA with two values between a new created file and the exist file on f2fs filesystem. 
F2FS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 8192) SEEK_DATA size != 0 (offset = 4096) PNFS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 4096) SEEK_DATA size != 0 (offset = 4096) int main(int argc, char **argv) { char *filename = argv[1]; int offset = 1, i = 0, fd = -1; if (argc < 2) { printf("Usage: %s f2fsfilename\n", argv[0]); return -1; } /* if (!access(filename, F_OK) || errno != ENOENT) { printf("Needs a new file for test, %m\n"); return -1; }*/ fd = open(filename, O_RDWR | O_CREAT, 0777); if (fd < 0) { printf("Create test file %s failed, %m\n", filename); return -1; } for (i = 0; i < 20; i++) { offset = 1 << i; ftruncate(fd, 0); lseek(fd, offset, SEEK_SET); write(fd, "test", 5); /* Get the alloc size by seek data equal zero*/ if (lseek(fd, 0, SEEK_DATA)) { printf("SEEK_DATA size != 0 (offset = %d)\n", offset); break; } } close(fd); return 0; } Reported-and-Tested-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7eacef08797..9da13847cda4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,6 +571,8 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (f2fs_has_inline_data(inode)) { if (truncate_inline_inode(ipage, from)) set_page_dirty(ipage); + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); truncate_page = true; goto out; From 5521ead70476162d3cef2324320784cb4dbd0c10 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 14:07:53 -0800 Subject: [PATCH 091/804] f2fs: relax async discard commands more This patch relaxes async discard commands to avoid waiting its end_io during checkpoint. Instead of waiting them during checkpoint, it will be done when actually reusing them. Test on initial partition of nvme drive. 
# time fstrim /mnt/test Before : 6.158s After : 4.822s Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++----- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/segment.c | 24 +++++++++++++++++++----- fs/f2fs/super.c | 3 +++ 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d485bea3d6bb..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,7 +1255,6 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1274,12 +1273,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) { + if (err) release_discard_addrs(sbi); - } else { + else clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19e054b1c4f8..3409392dde9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,8 @@ struct discard_entry { struct bio_entry { struct list_head list; + block_t lstart; + block_t len; struct bio *bio; struct completion event; int error; @@ -2178,7 +2180,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); +void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c 
index 353ec85b3835..fa3d4f8db389 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -625,20 +625,23 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, - struct bio *bio) + struct bio *bio, block_t lstart, block_t len) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&be->list); be->bio = bio; + be->lstart = lstart; + be->len = len; init_completion(&be->event); list_add_tail(&be->list, wait_list); return be; } -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be, *tmp; @@ -647,7 +650,15 @@ void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) struct bio *bio = be->bio; int err; - wait_for_completion_io(&be->event); + if (!completion_done(&be->event)) { + if ((be->lstart <= blkaddr && + blkaddr < be->lstart + be->len) || + blkaddr == NULL_ADDR) + wait_for_completion_io(&be->event); + else + continue; + } + err = be->error; if (err == -EOPNOTSUPP) err = 0; @@ -756,6 +767,7 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { struct bio *bio = NULL; + block_t lblkstart = blkstart; int err; trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); @@ -770,13 +782,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio); + struct bio_entry *be = __add_bio_entry(sbi, bio, + lblkstart, blklen); bio->bi_private = be; bio->bi_end_io = f2fs_submit_bio_wait_endio; submit_bio(REQ_SYNC, bio); } - return err; } @@ -1655,6 +1667,8 @@ void allocate_data_block(struct 
f2fs_sb_info *sbi, struct page *page, *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); + /* * __add_sum_entry should be resided under the curseg_mutex * because, this function updates a summary entry in the diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3b169927408e..84d5686c4aa4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -770,6 +770,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* be sure to wait for any on-going discard commands */ + f2fs_wait_discard_bio(sbi, NULL_ADDR); + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); From 711f0385dc67a56d49c57e3857833fdf74fd40f8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 16:58:54 -0800 Subject: [PATCH 092/804] f2fs: avoid needless checkpoint in f2fs_trim_fs The f2fs_trim_fs() doesn't need to do checkpoint if there are newly allocated data blocks only which didn't change the critical checkpoint data such as nat and sit entries. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..886b96c12c31 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1249,14 +1249,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { - f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); - f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); - f2fs_bug_on(sbi, prefree_segments(sbi)); - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); - unblock_operations(sbi); - goto out; + if (cpc->reason == CP_DISCARD) { + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } } /* From 99a5dca4d9c6efd55dd548cf1e30ce86912f47ac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 22:06:15 -0800 Subject: [PATCH 093/804] f2fs: return fs_trim if there is no candidate If there is no candidate to submit discard command during f2fs_trim_fs, let's return without checkpoint.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 +++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 886b96c12c31..fbf04d4d7964 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1250,6 +1250,11 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason == CP_DISCARD) { + if (!exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3409392dde9c..3eb53e3a8eae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2186,6 +2186,7 @@ void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); +bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void update_meta_page(struct f2fs_sb_info *, void *, block_t); void write_meta_page(struct f2fs_sb_info *, struct page *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa3d4f8db389..12f8d5ab7ccf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -914,7 +914,8 @@ done: SM_I(sbi)->nr_discards += end - start; } -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; @@ -928,12 +929,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) int i; if (se->valid_blocks == max_blocks || 
!f2fs_discard_en(sbi)) - return; + return false; if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) - return; + return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ @@ -951,8 +952,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) && (end - start) < cpc->trim_minlen) continue; + if (check_only) + return true; + __add_discard_entry(sbi, cpc, se, start, end); } + return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) @@ -1533,6 +1538,19 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + + mutex_lock(&SIT_I(sbi)->sentry_lock); + for (; trim_start <= cpc->trim_end; trim_start++) + if (add_discard_addrs(sbi, cpc, true)) + break; + mutex_unlock(&SIT_I(sbi)->sentry_lock); + + return trim_start <= cpc->trim_end; +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); @@ -2329,7 +2347,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* add discard candidates */ if (cpc->reason != CP_DISCARD) { cpc->trim_start = segno; - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); } if (to_journal) { @@ -2367,7 +2385,7 @@ out: __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } From f948bcc51e136aa4f23f6fe05ea51a5dc66bcdd5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:49:42 +0800 Subject: [PATCH 094/804] f2fs: clean up with list_{first, last}_entry Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/f2fs/node.h | 2 +- 3 files 
changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fbf04d4d7964..45ef3b6bfb04 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -892,7 +892,7 @@ retry: F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { @@ -925,7 +925,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, + fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1b19b805ef81..669c267cd36e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1143,7 +1143,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = list_last_entry(pages, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1261,7 +1261,7 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e7997e240366..9278b21ee073 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -174,7 +174,7 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], struct free_nid, list); *nid = 
fnid->nid; spin_unlock(&nm_i->nid_list_lock); From 72d48dabe998550f45038d44c98ec286e5161ce6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:50:26 +0800 Subject: [PATCH 095/804] f2fs: introduce FI_ATOMIC_COMMIT This patch introduces a new flag to indicate that an inode is in the middle of committing an atomic write, so that we can keep the atomic write status of the inode during the commit and skip GCing pages of the atomic write inode. This avoids randomly GCed data being mixed with the current transaction, so isolation of the transaction is kept. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/file.c | 11 ++++++----- fs/f2fs/gc.c | 6 ++++++ fs/f2fs/segment.c | 10 +++++++--- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 669c267cd36e..7efc2bf88641 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1976,7 +1976,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (f2fs_is_atomic_file(inode)) { + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { register_inmem_page(inode, page); return 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3eb53e3a8eae..807855d37c63 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1706,6 +1706,7 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -1895,6 +1896,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode,
FI_ATOMIC_COMMIT); +} + static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9da13847cda4..e4e5d76d80b0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1573,14 +1573,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); - if (ret) { - set_inode_flag(inode, FI_ATOMIC_FILE); + if (ret) goto err_out; - } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - stat_dec_atomic_write(inode); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); + } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d3a36e4b442c..7f0c3e02408c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -569,6 +569,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -661,6 +664,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 12f8d5ab7ccf..6a870677d58a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -242,12 +242,12 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); - stat_dec_atomic_write(inode); - mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + 
stat_dec_atomic_write(inode); } static int __commit_inmem_pages(struct inode *inode, @@ -316,6 +316,8 @@ int commit_inmem_pages(struct inode *inode) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + mutex_lock(&fi->inmem_lock); err = __commit_inmem_pages(inode, &revoke_list); if (err) { @@ -337,6 +339,8 @@ int commit_inmem_pages(struct inode *inode) } mutex_unlock(&fi->inmem_lock); + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + f2fs_unlock_op(sbi); return err; } From c50d5c09193e413467fca1a3fdfa5a69e59a6930 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:51:01 +0800 Subject: [PATCH 096/804] f2fs: check in-memory block bitmap This patch adds a mirror for valid block bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++++++++++++++++++++++-- fs/f2fs/segment.h | 6 ++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6a870677d58a..aae1c2ea7a1d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1101,14 +1101,32 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else 
f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -2432,6 +2450,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2861,6 +2886,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif kfree(sit_i->sentries[start].ckpt_valid_map); kfree(sit_i->sentries[start].discard_map); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 08f1455c812c..9af95194db06 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -164,6 +164,9 @@ struct seg_entry { unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. 
@@ -320,6 +323,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } From 8a576d4d407b72324e476544b3ab9ae2b0998788 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:01 +0800 Subject: [PATCH 097/804] f2fs: check in-memory nat version bitmap This patch adds a mirror for nat version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 11 +++++++++++ fs/f2fs/node.h | 15 +++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 807855d37c63..d4783d9cf4e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -607,6 +607,9 @@ struct f2fs_nm_info { /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b01b01cfc39e..bc67dc323f7e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2366,6 +2366,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + return 0; } @@ -2440,6 +2448,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } diff 
--git a/fs/f2fs/node.h b/fs/f2fs/node.h index 9278b21ee073..29ff783eb9c3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -186,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -203,6 +209,12 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) (seg_off << sbi->log_blocks_per_seg << 1) + (block_off & (sbi->blocks_per_seg - 1))); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != + f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -228,6 +240,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } static inline nid_t ino_of_node(struct page *node_page) From e3d4c4b5f18cd817225906e341aadf8fd8a01345 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:34 +0800 Subject: [PATCH 098/804] f2fs: check in-memory sit version bitmap This patch adds a mirror for sit version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++---- fs/f2fs/segment.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aae1c2ea7a1d..c39bbffb0cac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2421,7 +2421,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; + char *src_bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -2483,17 +2483,22 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; +#endif + /* init SIT information */ sit_i->s_ops = &default_salloc_ops; sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; - sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; @@ -2901,6 +2906,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif kfree(sit_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9af95194db06..5cb5755c75d9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -209,6 +209,9 @@ struct sit_info { block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks 
in main area */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ @@ -423,6 +426,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } @@ -643,6 +652,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, check_seg_range(sbi, start); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; @@ -668,6 +683,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) From 0d7a55b0135b91c5eb8ecaf1b6bfe0c0f5eca3fd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 14:13:03 -0800 Subject: [PATCH 099/804] f2fs: clean up flush/discard command namings This patch simply cleans up the names for flush/discard commands. 
Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 20 +++++----- fs/f2fs/segment.c | 98 +++++++++++++++++++++++------------------------ 3 files changed, 59 insertions(+), 61 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 29cdf0c1da1d..883f1ea9e0b6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -194,7 +194,7 @@ get_cache: si->cache_mem += sizeof(struct f2fs_gc_kthread); /* build merge flush thread */ - if (SM_I(sbi)->cmd_control_info) + if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4783d9cf4e0..167c5f841b5f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -248,13 +248,12 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; -struct bio_entry { - struct list_head list; - block_t lstart; - block_t len; - struct bio *bio; - struct completion event; - int error; +struct discard_cmd { + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + block_t lstart; /* logical start address */ + block_t len; /* length */ + struct bio *bio; /* bio */ }; /* for the list of fsync inodes, used only during recovery */ @@ -701,8 +700,8 @@ struct f2fs_sm_info { unsigned int rec_prefree_segments; /* for small discard management */ - struct list_head discard_list; /* 4KB discard list */ - struct list_head wait_list; /* linked with issued discard bio */ + struct list_head discard_entry_list; /* 4KB discard entry list */ + struct list_head discard_cmd_list; /* discard cmd list */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. 
discards to be issued */ @@ -716,8 +715,7 @@ struct f2fs_sm_info { unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ - struct flush_cmd_control *cmd_control_info; - + struct flush_cmd_control *fcc_info; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c39bbffb0cac..289b3facd2d8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,7 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -439,7 +439,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) @@ -468,7 +468,7 @@ repeat: int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), @@ -511,8 +511,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; - if (SM_I(sbi)->cmd_control_info) { - fcc = SM_I(sbi)->cmd_control_info; + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; goto init_thread; } @@ -522,14 +522,14 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); - SM_I(sbi)->cmd_control_info = fcc; + SM_I(sbi)->fcc_info = fcc; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = 
PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; return err; } @@ -538,7 +538,7 @@ init_thread: void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; @@ -548,7 +548,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } if (free) { kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; } } @@ -628,42 +628,43 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, +static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc; - INIT_LIST_HEAD(&be->list); - be->bio = bio; - be->lstart = lstart; - be->len = len; - init_completion(&be->event); - list_add_tail(&be->list, wait_list); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bio = bio; + dc->lstart = lstart; + dc->len = len; + init_completion(&dc->wait); + list_add_tail(&dc->list, wait_list); - return be; + return dc; } /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be, *tmp; + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc, *tmp; - list_for_each_entry_safe(be, tmp, wait_list, list) { - struct bio *bio = be->bio; + 
list_for_each_entry_safe(dc, tmp, wait_list, list) { + struct bio *bio = dc->bio; int err; - if (!completion_done(&be->event)) { - if ((be->lstart <= blkaddr && - blkaddr < be->lstart + be->len) || + if (!completion_done(&dc->wait)) { + if ((dc->lstart <= blkaddr && + blkaddr < dc->lstart + dc->len) || blkaddr == NULL_ADDR) - wait_for_completion_io(&be->event); + wait_for_completion_io(&dc->wait); else continue; } - err = be->error; + err = bio->bi_error; if (err == -EOPNOTSUPP) err = 0; @@ -672,17 +673,16 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) "Issue discard failed, ret: %d", err); bio_put(bio); - list_del(&be->list); - kmem_cache_free(bio_entry_slab, be); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); } } -static void f2fs_submit_bio_wait_endio(struct bio *bio) +static void f2fs_submit_discard_endio(struct bio *bio) { - struct bio_entry *be = (struct bio_entry *)bio->bi_private; + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - be->error = bio->bi_error; - complete(&be->event); + complete(&dc->wait); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -786,11 +786,11 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio, + struct discard_cmd *dc = __add_discard_cmd(sbi, bio, lblkstart, blklen); - bio->bi_private = be; - bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); } return err; @@ -897,7 +897,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_list; + struct list_head *head = &SM_I(sbi)->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { @@ -966,7 +966,7 @@ static bool add_discard_addrs(struct f2fs_sb_info 
*sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,7 +992,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct blk_plug plug; @@ -2783,8 +2783,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_list); - INIT_LIST_HEAD(&sm_info->wait_list); + INIT_LIST_HEAD(&sm_info->discard_entry_list); + INIT_LIST_HEAD(&sm_info->discard_cmd_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2934,15 +2934,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; - bio_entry_slab = f2fs_kmem_cache_create("bio_entry", - sizeof(struct bio_entry)); - if (!bio_entry_slab) + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_bio_entry; + goto destroy_discard_cmd; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2952,8 +2952,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destroy_bio_entry: - kmem_cache_destroy(bio_entry_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2963,7 +2963,7 
@@ fail: void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); - kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From 4844bb76e75265a7c742cfcaf0e8e54f02994933 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 10:21:15 -0800 Subject: [PATCH 100/804] f2fs: reorganize stat information This patch reorganizes the stat information so that it reads more clearly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 883f1ea9e0b6..cd338ca24941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -258,8 +258,6 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); - seq_printf(s, " - Atomic write count: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -318,8 +316,10 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", - si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. 
%4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", From c4cc29d19eaf010c1133823438f5a3adba155f05 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 18:16:29 -0800 Subject: [PATCH 101/804] f2fs: remove batched discard in f2fs_trim_fs We don't need to do multiple checkpoints, since we don't actually wait for completion of discard commands during checkpoint. Instead, we still need to avoid very big discard commands, since such a large discard can interfere with block allocation. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ---- fs/f2fs/f2fs.h | 9 +----- fs/f2fs/segment.c | 38 +++++++++---------------- fs/f2fs/super.c | 2 -- 4 files changed, 14 insertions(+), 41 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c727..bc8fbfa1c800 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -75,12 +75,6 @@ Contact: "Jaegeuk Kim" Description: Controls the memory footprint used by f2fs. -What: /sys/fs/f2fs//trim_sections -Date: February 2015 -Contact: "Jaegeuk Kim" -Description: - Controls the trimming rate in batch mode. 
- What: /sys/fs/f2fs//cp_interval Date: October 2015 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 167c5f841b5f..dc436f780295 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -195,11 +195,7 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 2 -#define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) -#define BATCHED_TRIM_BLOCKS(sbi) \ - (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) (1 << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -705,9 +701,6 @@ struct f2fs_sm_info { int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 289b3facd2d8..245ba28529b1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -903,7 +903,8 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, if (!list_empty(head)) { last = list_last_entry(head, struct discard_entry, list); if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { + last->blkaddr + last->len && + last->len <= MAX_DISCARD_BLOCKS(sbi)) { last->len += end - start; goto done; } @@ -1593,36 +1594,25 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) "Found FS corruption, run fsck to fix."); goto out; } + if (sbi->discard_blks == 0) + goto out; /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + /* + * do checkpoint to issue discard commands safely since we now can + * use async discard. 
+ */ cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; - /* do checkpoint to issue discard commands safely */ - for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { - cpc.trim_start = start_segno; - - if (sbi->discard_blks == 0) - break; - else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) - cpc.trim_end = end_segno; - else - cpc.trim_end = min_t(unsigned int, - rounddown(start_segno + - BATCHED_TRIM_SEGMENTS(sbi), - sbi->segs_per_sec) - 1, end_segno); - - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); - if (err) - break; - - schedule(); - } + mutex_lock(&sbi->gc_mutex); + err = write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -2788,8 +2778,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; - sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; - INIT_LIST_HEAD(&sm_info->sit_entry_set); if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 84d5686c4aa4..38d40670aed0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -284,7 +284,6 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -309,7 +308,6 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), 
ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 565f0225f95f1518132952e8fe6854c92a60fd46 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 14:40:24 -0800 Subject: [PATCH 102/804] f2fs: factor out discard command info into discard_cmd_control This patch adds discard_cmd_control with the existing discarding controls. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 ++ fs/f2fs/f2fs.h | 16 ++++++----- fs/f2fs/segment.c | 68 ++++++++++++++++++++++++++++++++++++----------- fs/f2fs/super.c | 5 +++- 4 files changed, 69 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index cd338ca24941..f9f6b0aeba02 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -196,6 +196,8 @@ get_cache: /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) + si->cache_mem += sizeof(struct discard_cmd_control); /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc436f780295..1bec4707e830 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -252,6 +252,13 @@ struct discard_cmd { struct bio *bio; /* bio */ }; +struct discard_cmd_control { + struct list_head discard_entry_list; /* 4KB discard entry list */ + int nr_discards; /* # of discards in the list */ + struct list_head discard_cmd_list; /* discard cmd list */ + int max_discards; /* max. 
discards to be issued */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -695,12 +702,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - struct list_head discard_cmd_list; /* discard cmd list */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ @@ -709,6 +710,9 @@ struct f2fs_sm_info { /* for flush command control */ struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 245ba28529b1..dbe4b3e3198f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -631,7 +631,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *cmd_list = &(dcc->discard_cmd_list); struct discard_cmd *dc; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); @@ -640,7 +641,7 @@ static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart = lstart; dc->len = len; init_completion(&dc->wait); - list_add_tail(&dc->list, wait_list); + list_add_tail(&dc->list, cmd_list); return dc; } @@ -648,7 +649,8 @@ static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct list_head *wait_list = 
&(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->discard_cmd_list); struct discard_cmd *dc, *tmp; list_for_each_entry_safe(dc, tmp, wait_list, list) { @@ -897,7 +899,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_entry_list; + struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { @@ -916,7 +918,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, new->len = end - start; list_add_tail(&new->list, head); done: - SM_I(sbi)->nr_discards += end - start; + SM_I(sbi)->dcc_info->nr_discards += end - start; } static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, @@ -938,7 +940,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) return false; } @@ -947,7 +950,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, dmap[i] = force ? 
~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -967,7 +971,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -993,7 +997,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct blk_plug plug; @@ -1053,13 +1057,47 @@ next: cpc->trimmed += entry->len; skip: list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } blk_finish_plug(&plug); } +int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc; + int err = 0; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + INIT_LIST_HEAD(&dcc->discard_entry_list); + INIT_LIST_HEAD(&dcc->discard_cmd_list); + dcc->nr_discards = 0; + dcc->max_discards = 0; + + SM_I(sbi)->dcc_info = dcc; +init_thread: + return err; +} + +void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (free) { + kfree(dcc); + 
SM_I(sbi)->dcc_info = NULL; + } +} + static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); @@ -2773,11 +2811,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_entry_list); - INIT_LIST_HEAD(&sm_info->discard_cmd_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; - INIT_LIST_HEAD(&sm_info->sit_entry_set); if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { @@ -2786,6 +2819,10 @@ int build_segment_manager(struct f2fs_sb_info *sbi) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2907,6 +2944,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 38d40670aed0..1f152734b2ec 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -145,6 +145,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -168,6 +169,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; else if (struct_type == NM_INFO) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) @@ -283,7 +286,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, 
gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); From 587ad91ac9a8fe33865b05e086fea6384ecfbe48 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 20:32:07 -0800 Subject: [PATCH 103/804] f2fs: add a kernel thread to issue discard commands asynchronously This patch adds a kernel thread to issue discard commands. It proposes three states, D_PREP, D_SUBMIT, and D_DONE to identify current bio status. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 11 ++++ fs/f2fs/segment.c | 131 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 109 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1bec4707e830..29aa96496c67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -196,6 +196,7 @@ enum { }; #define MAX_DISCARD_BLOCKS(sbi) (1 << (sbi)->log_blocks_per_seg) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -244,18 +245,28 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + struct discard_cmd { struct list_head list; /* command list */ struct completion wait; /* compleation */ block_t lstart; /* logical start address */ block_t len; /* length */ struct bio *bio; /* bio */ + int state; /* state */ }; struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head discard_entry_list; /* 4KB discard entry list */ int nr_discards; /* # of 
discards in the list */ struct list_head discard_cmd_list; /* discard cmd list */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + struct mutex cmd_lock; int max_discards; /* max. discards to be issued */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dbe4b3e3198f..bae15887ac98 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -628,7 +628,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, +static void __add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -638,12 +638,30 @@ static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); dc->bio = bio; + bio->bi_private = dc; dc->lstart = lstart; dc->len = len; + dc->state = D_PREP; init_completion(&dc->wait); - list_add_tail(&dc->list, cmd_list); - return dc; + mutex_lock(&dcc->cmd_lock); + list_add_tail(&dc->list, cmd_list); + mutex_unlock(&dcc->cmd_lock); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) +{ + int err = dc->bio->bi_error; + + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + bio_put(dc->bio); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -653,31 +671,28 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) struct list_head *wait_list = &(dcc->discard_cmd_list); struct discard_cmd *dc, *tmp; + mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - struct bio *bio = dc->bio; - int err; - if (!completion_done(&dc->wait)) { - if ((dc->lstart <= blkaddr && - blkaddr < dc->lstart + dc->len) 
|| - blkaddr == NULL_ADDR) - wait_for_completion_io(&dc->wait); - else - continue; + if (blkaddr == NULL_ADDR) { + if (dc->state == D_PREP) { + dc->state = D_SUBMIT; + submit_bio(REQ_SYNC, dc->bio); + } + wait_for_completion_io(&dc->wait); + + __remove_discard_cmd(sbi, dc); + continue; } - err = bio->bi_error; - if (err == -EOPNOTSUPP) - err = 0; - - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); - - bio_put(bio); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); + if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { + if (dc->state == D_SUBMIT) + wait_for_completion_io(&dc->wait); + else + __remove_discard_cmd(sbi, dc); + } } + mutex_unlock(&dcc->cmd_lock); } static void f2fs_submit_discard_endio(struct bio *bio) @@ -685,6 +700,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; complete(&dc->wait); + dc->state = D_DONE; } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -768,6 +784,45 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + struct list_head *cmd_list = &dcc->discard_cmd_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0; +repeat: + if (kthread_should_stop()) + return 0; + + blk_start_plug(&plug); + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, cmd_list, list) { + if (dc->state == D_PREP) { + dc->state = D_SUBMIT; + submit_bio(REQ_SYNC, dc->bio); + if (iter++ > DISCARD_ISSUE_RATE) + break; + } else if (dc->state == D_DONE) { + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); + + blk_finish_plug(&plug); + + iter = 0; + congestion_wait(BLK_RW_SYNC, HZ/50); + + wait_event_interruptible(*q, + kthread_should_stop() || 
!list_empty(&dcc->discard_cmd_list)); + goto repeat; +} + + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -788,12 +843,9 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct discard_cmd *dc = __add_discard_cmd(sbi, bio, - lblkstart, blklen); - - bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); + __add_discard_cmd(sbi, bio, lblkstart, blklen); + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); } return err; } @@ -1000,14 +1052,11 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); - blk_start_plug(&plug); - mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -1060,12 +1109,11 @@ skip: SM_I(sbi)->dcc_info->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } - - blk_finish_plug(&plug); } int create_discard_cmd_control(struct f2fs_sb_info *sbi) { + dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; int err = 0; @@ -1080,11 +1128,22 @@ int create_discard_cmd_control(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->discard_entry_list); INIT_LIST_HEAD(&dcc->discard_cmd_list); + mutex_init(&dcc->cmd_lock); dcc->nr_discards = 0; dcc->max_discards = 0; + init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if 
(IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + return err; } @@ -1092,6 +1151,12 @@ void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } if (free) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; From 334173cc4ca16534d011446c301e85a7cba5c035 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 10:20:04 -0800 Subject: [PATCH 104/804] f2fs: show # of on-going flush and discard bios This patch adds stat information for flush and discard commands. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/debug.c | 11 +++++++++-- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 6 ++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f9f6b0aeba02..0ca977a94c13 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -54,6 +54,12 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) + si->nr_flush = + atomic_read(&SM_I(sbi)->fcc_info->submit_flush); + if (SM_I(sbi) && SM_I(sbi)->dcc_info) + si->nr_discard = + atomic_read(&SM_I(sbi)->dcc_info->submit_discard); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -318,8 +324,9 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", - 
si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flush, si->nr_discard); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 29aa96496c67..9a4e2012ba36 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -268,6 +268,7 @@ struct discard_cmd_control { wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; int max_discards; /* max. discards to be issued */ + atomic_t submit_discard; /* # of issued discard */ }; /* for the list of fsync inodes, used only during recovery */ @@ -2321,7 +2322,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data; + int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; int inline_xattr, inline_inode, inline_dir, orphans; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bae15887ac98..5efc36f88b4a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -653,6 +653,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *d { int err = dc->bio->bi_error; + if (dc->state == D_DONE) + atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); + if (err == -EOPNOTSUPP) err = 0; @@ -678,6 +681,7 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) if (dc->state == D_PREP) { dc->state = D_SUBMIT; submit_bio(REQ_SYNC, dc->bio); + atomic_inc(&dcc->submit_discard); } wait_for_completion_io(&dc->wait); @@ -804,6 +808,7 @@ repeat: if (dc->state == D_PREP) { dc->state = D_SUBMIT; submit_bio(REQ_SYNC, dc->bio); + atomic_inc(&dcc->submit_discard); if (iter++ > 
DISCARD_ISSUE_RATE) break; } else if (dc->state == D_DONE) { @@ -1129,6 +1134,7 @@ int create_discard_cmd_control(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->discard_entry_list); INIT_LIST_HEAD(&dcc->discard_cmd_list); mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->submit_discard, 0); dcc->nr_discards = 0; dcc->max_discards = 0; From 87d83ae92ee06c4478d17db9934c0ea3b52be164 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Jan 2017 13:12:29 -0800 Subject: [PATCH 105/804] f2fs: do not preallocate blocks which has wrong buffer Sheng Yong reports needless preallocation if write(small_buffer, large_size) is called. In that case, f2fs preallocates large_size, but vfs returns early due to small_buffer size. Let's detect it before preallocation phase in f2fs. Reported-by: Sheng Yong Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/file.c --- fs/f2fs/data.c | 6 +++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 8 +++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7efc2bf88641..ead210ae9468 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -749,6 +749,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -1650,7 +1653,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. 
*/ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a4e2012ba36..7fc161474647 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1724,6 +1724,7 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e4e5d76d80b0..27ef66ff7aab 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -2277,8 +2278,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - int err = f2fs_preallocate_blocks(iocb, from); + int err; + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + err = f2fs_preallocate_blocks(iocb, from); if (err) { inode_unlock(inode); return err; @@ -2286,6 +2291,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); } inode_unlock(inode); From b89d1d4dfd7d78e91b425706c23130bbdcebb813 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Sun, 22 Jan 2017 12:21:02 +0800 Subject: [PATCH 106/804] f2fs: fix a dead loop in f2fs_fiemap() A dead loop can be triggered in f2fs_fiemap() using the test case as below: ... fd = open(); fallocate(fd, 0, 0, 4294967296); ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); ... It's caused by an overflow in __get_data_block(): ... 
bh->b_size = map.m_len << inode->i_blkbits; ... map.m_len is an unsigned int, and bh->b_size is a size_t which is 64 bits on 64 bits architecture, type conversion from an unsigned int to a size_t will result in an overflow. In the above-mentioned case, bh->b_size will be zero, and f2fs_fiemap() will call get_data_block() at block 0 again and again. Fix this by adding a force conversion before left shift. Signed-off-by: Wei Fang Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ead210ae9468..dbd2822b5ab8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -964,7 +964,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } return err; } From 21980a25e7276552f4923089dc28bb9af9024e9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jan 2017 20:39:51 +0800 Subject: [PATCH 107/804] f2fs: enhance lookup xattr Previously, in getxattr we will load all entries both in inline xattr and xattr node block, and then do the lookup in all entries, but our lookup flow shows low efficiency, since if we can lookup and hit in inline xattr of inode page cache first, we don't need to load and lookup xattr node block, which can obviously save cpu time and IO latency.
Signed-off-by: Chao Yu [Jaegeuk Kim: initialize NULL to avoid warning] Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 132 ++++++++++++++++++++++++++++++++++++++++++------ fs/f2fs/xattr.h | 7 +-- 2 files changed, 121 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1c4d5e39586c..ba67ca0c7014 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,6 +264,112 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? 
VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = 0; + int err = 0; + + inline_size = inline_xattr_size(inode); + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { @@ -395,8 +501,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -408,10 +513,11 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, int 
f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + char *pval; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -420,30 +526,26 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } + pval = entry->e_name + entry->e_name_len; + if (buffer) { char *pval = entry->e_name + entry->e_name_len; memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d2fd0387a3c7..ba64f43d163d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ From d27bebf86574c0edbbf7d17dfd6568ba72c752c7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Jan 2017 10:52:39 +0800 Subject: [PATCH 108/804] f2fs: fix to avoid overflow when left 
shifting page offset We use the following method to calculate size with current page index: size = index << PAGE_SHIFT If type of index has only 32-bits size, left shifting will incur overflow, which makes result incorrect. So let's cast index with 64-bits type to avoid such issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4fb4471a3206..e93316ea8d1b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -428,8 +428,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= (start << PAGE_SHIFT))) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block From fb40e1231cbc48f58432e095a22aadcff0c0c557 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Jan 2017 10:52:40 +0800 Subject: [PATCH 109/804] f2fs: fix null pointer dereference when issuing flush in ->fsync We only allocate flush merge control structure sbi::sm_info::fcc_info when flush_merge option is on, but in f2fs_issue_flush we still try to access member of the control structure without that option, it incurs panic as shown below, fix it.
Call Trace: __remove_ino_entry+0xa9/0xc0 [f2fs] f2fs_do_sync_file.isra.27+0x214/0x6d0 [f2fs] f2fs_sync_file+0x18/0x20 [f2fs] vfs_fsync_range+0x3d/0xb0 __do_page_fault+0x261/0x4d0 do_fsync+0x3d/0x70 SyS_fsync+0x10/0x20 do_syscall_64+0x6e/0x180 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7f18ce260de0 RSP: 002b:00007ffdd4589258 EFLAGS: 00000246 ORIG_RAX: 000000000000004a RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f18ce260de0 RDX: 0000000000000006 RSI: 00000000016c0360 RDI: 0000000000000003 RBP: 00000000016c0360 R08: 000000000000ffff R09: 000000000000001f R10: 00007ffdd4589020 R11: 0000000000000246 R12: 00000000016c0100 R13: 0000000000000000 R14: 00000000016c1f00 R15: 00000000016c0100 Code: fb 81 e3 00 08 00 00 48 89 45 a0 0f 1f 44 00 00 31 c0 85 db 75 27 41 81 e7 00 04 00 00 74 0c 41 8b 45 20 85 c0 0f 85 81 00 00 00 41 ff 45 20 4c 89 e7 e8 f8 e9 ff ff f0 41 ff 4d 20 48 83 c4 RIP: f2fs_issue_flush+0x5b/0x170 [f2fs] RSP: ffffc90003b5fd78 CR2: 0000000000000020 ---[ end trace a09314c24f037648 ]--- Reported-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5efc36f88b4a..11f2eccd873d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -477,7 +477,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { + if (!test_opt(sbi, FLUSH_MERGE)) + return submit_flush_wait(sbi); + + if (!atomic_read(&fcc->submit_flush)) { int ret; atomic_inc(&fcc->submit_flush); From f3ca0da5c793d6a74ad6355c70b40f0fb314a779 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 27 Jan 2017 09:35:37 +0800 Subject: [PATCH 110/804] f2fs: show the fault injection mount option This patch shows the fault injection mount option in f2fs_show_options(). 
Signed-off-by: Kaixu Xia Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f152734b2ec..08b6ba9b3f14 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -554,6 +554,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -944,6 +945,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_puts(seq, ",fault_injection"); +#endif return 0; } From 01940a21a97e8f03b8f8fe618728ff099f0e9481 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 29 Jan 2017 14:27:02 +0900 Subject: [PATCH 111/804] f2fs: declare missing static function We missed two functions declared as static functions. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 11f2eccd873d..9f0d77b4eefd 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1119,7 +1119,7 @@ skip: } } -int create_discard_cmd_control(struct f2fs_sb_info *sbi) +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; @@ -1156,7 +1156,7 @@ init_thread: return err; } -void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; From c99f2de7780a483dd50456282442d1021ad9d246 Mon Sep 17 00:00:00 2001 From: DongOh Shin Date: Mon, 30 Jan 2017 10:55:17 -0800 Subject: [PATCH 112/804] f2fs: fix 3 coding style errors in f2fs.h Two coding style errors below have been resolved: "Macros with complex values should be enclosed in parentheses" And a coding style error below has been resolved: "space prohibited before that ',' (ctx:WxW)" Signed-off-by: DongOh Shin Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7fc161474647..95a559a359ca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -109,9 +109,9 @@ struct f2fs_mount_info { #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* bio stuffs */ #define REQ_OP_READ READ @@ -2107,7 +2107,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int 
update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); + const struct qstr *, f2fs_hash_t, unsigned int); int f2fs_add_regular_entry(struct inode *, const struct qstr *, const struct qstr *, struct inode *, nid_t, umode_t); int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, From a94d94904cd3ecce3dc44f582c5213747467161b Mon Sep 17 00:00:00 2001 From: DongOh Shin Date: Mon, 30 Jan 2017 10:55:18 -0800 Subject: [PATCH 113/804] f2fs: fix 446 coding style warnings in f2fs.h 1) Nine coding style warnings below have been resolved: "Missing a blank line after declarations" 2) 435 coding style warnings below have been resolved: "function definition argument 'x' should also have an identifier name" 3) Two coding style warnings below have been resolved: "macros should not use a trailing semicolon" Signed-off-by: DongOh Shin Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 498 +++++++++++++++++++++++++++---------------------- 1 file changed, 270 insertions(+), 228 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 95a559a359ca..7f97aee70b12 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -293,6 +293,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -300,6 +301,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -385,12 +387,14 @@ static inline void make_dentry_ptr(struct inode *inode, if (type == 1) { struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; + d->max = NR_DENTRY_IN_BLOCK; d->bitmap = &t->dentry_bitmap; 
d->dentry = t->dentry; d->filename = t->filename; } else { struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; + d->max = NR_INLINE_DENTRY; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; @@ -579,7 +583,7 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { @@ -1537,6 +1541,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; @@ -1620,6 +1625,7 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } @@ -1633,6 +1639,7 @@ static inline block_t datablock_addr(struct page *node_page, { struct f2fs_node *raw_node; __le32 *addr_array; + raw_node = F2FS_NODE(node_page); addr_array = blkaddr_in_node(raw_node); return le32_to_cpu(addr_array[offset]); @@ -1869,6 +1876,7 @@ static inline unsigned int addrs_per_inode(struct inode *inode) static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1930,6 +1938,7 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) static inline void *inline_data_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[1]); } @@ -2052,29 +2061,30 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64, 
bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct 
writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -2084,40 +2094,47 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t, unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode 
*); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +int update_dent_inode(struct inode *inode, struct inode *to, + const struct qstr *name); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int 
__f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2128,18 +2145,18 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *, bool); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); /* * node.c @@ -2147,164 +2164,183 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *); struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info 
*, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *, bool); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page 
*new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, + unsigned int ofs, struct page *ipage); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +void recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void 
refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +int commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); 
+int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +void rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, + block_t blkaddr); +void 
write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page 
*grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -int f2fs_submit_page_mbio(struct f2fs_io_info *); -struct block_device *f2fs_target_device(struct f2fs_sb_info *, - 
block_t, struct bio *); -int f2fs_target_device_index(struct f2fs_sb_info *, block_t); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw); +void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_mbio(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); 
+int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* 
* debug.c @@ -2398,9 +2434,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) #define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ @@ -2440,8 +2476,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else @@ -2493,49 +2529,55 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); 
-int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +bool truncate_inline_inode(struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void 
f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); From de0e3bc1a54a884d5c7392a0441b272c0e60ed5b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 1 Feb 2017 15:40:11 -0800 Subject: [PATCH 114/804] f2fs: show # of APPEND and UPDATE inodes This patch shows cached # of APPEND and UPDATE inode entries. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 6 ++++-- fs/f2fs/f2fs.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0ca977a94c13..de8da9fc5c99 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -70,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -264,8 +266,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7f97aee70b12..a32d1a2523a5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2360,7 +2360,7 @@ struct f2fs_stat_info { int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; - int inline_xattr, inline_inode, inline_dir, orphans; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; From 040eb7fd6c297f6f2b7e899930892e87d6a52796 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 Feb 2017 16:40:55 -0800 Subject: [PATCH 115/804] f2fs: move flush tracepoint This patch moves the tracepoint location for flush command. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9f0d77b4eefd..ab62f0be2b15 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -426,6 +426,9 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) int ret = __submit_flush_wait(sbi->sb->s_bdev); int i; + trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); + if (sbi->s_ndevs && !ret) { for (i = 1; i < sbi->s_ndevs; i++) { ret = __submit_flush_wait(FDEV(i).bdev); @@ -471,9 +474,6 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - if (test_opt(sbi, NOBARRIER)) return 0; From 8f70c40113f4c69e1bb9e1906e8d3b7a4c2b16d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 Feb 2017 18:27:17 -0800 Subject: [PATCH 116/804] f2fs: move write_node_page above fsync_node_pages This patch just moves write_node_page and introduces an inner function. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 140 ++++++++++++++++++++++++++----------------------- 1 file changed, 73 insertions(+), 67 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc67dc323f7e..5cdcf5ba43a0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1318,6 +1318,78 @@ continue_unlock: return last_page; } +static int __write_node_page(struct page *page, + struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) + f2fs_submit_merged_bio(sbi, NODE, WRITE); + + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return 
__write_node_page(page, wbc); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { @@ -1397,7 +1469,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, wbc); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1580,72 +1652,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - 
redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { From ed0eee678877e7e9276262616d447813edbb33ab Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 2 Feb 2017 18:18:06 -0800 Subject: [PATCH 117/804] f2fs: avoid out-of-order execution of atomic writes We need to flush data writes before flushing the last node block write by using FUA with PREFLUSH. We don't need to guarantee preceding node writes since if those are not written, we can't reach the last node block when scanning the node block chain during roll-forward recovery. Afterwards f2fs_wait_on_page_writeback guarantees all the IO submission to disk, which builds a valid node block chain. Signed-off-by: Jaegeuk Kim Conflicts: include/trace/events/f2fs.h --- fs/f2fs/file.c | 3 ++- fs/f2fs/node.c | 10 +++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 27ef66ff7aab..12c12cb4a06f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -279,7 +279,8 @@ sync_nodes: flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(sbi); + if (!atomic) + ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5cdcf5ba43a0..d24bdb970a24 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1318,7 +1318,7 @@ continue_unlock: return last_page; } -static int __write_node_page(struct page *page, +static int __write_node_page(struct page *page, bool atomic, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -1362,6 +1362,9 @@ static int __write_node_page(struct page *page, return 0; } + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= WRITE_FLUSH_FUA; + set_page_writeback(page); fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); @@ 
-1387,7 +1390,7 @@ redirty_out: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, wbc); + return __write_node_page(page, false, wbc); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1469,7 +1472,8 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_node_page(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, wbc); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); From 33f44e9f9c086f3e4767e7ebb17256e3b6da79db Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 3 Feb 2017 17:18:00 -0800 Subject: [PATCH 118/804] f2fs: call internal __write_data_page directly This patch introduces __write_data_page so that f2fs_write_cache_pages can call it directly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dbd2822b5ab8..f8a0bab49f0d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1343,7 +1343,7 @@ out_writepage: return err; } -static int f2fs_write_data_page(struct page *page, +static int __write_data_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -1445,6 +1445,12 @@ redirty_out: return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, wbc); +} + /* * This function was copied from write_cche_pages from mm/page-writeback.c. 
* The major change is making write step of cold data page separately from @@ -1534,7 +1540,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, wbc); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to From 6cd8a154387ddc8025c93327e86cfbe690dd7035 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 09:52:35 -0800 Subject: [PATCH 119/804] f2fs: fix missing bio_alloc(1) For discard commands, we should use bio_alloc(1) in old versions. Fixes: 373bb0247a ("f2fs: support async discard based on v4.9") Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ab62f0be2b15..83d34c80ec37 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -769,7 +769,7 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (ret) return ret; } - bio = f2fs_bio_alloc(0); + bio = f2fs_bio_alloc(1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); From 5e95180bf64c01a1ed4238ff92b1524ebfb8e34f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 16:53:14 -0800 Subject: [PATCH 120/804] Revert "f2fs: remove batched discard in f2fs_trim_fs" This reverts commit c4cc29d19eaf010c1133823438f5a3adba155f05. Conflicts: fs/f2fs/f2fs.h fs/f2fs/segment.c fs/f2fs/super.c --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++ fs/f2fs/f2fs.h | 10 ++++++- fs/f2fs/segment.c | 38 ++++++++++++++++--------- fs/f2fs/super.c | 2 ++ 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index bc8fbfa1c800..0345f2d1c727 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -75,6 +75,12 @@ Contact: "Jaegeuk Kim" Description: Controls the memory footprint used by f2fs. 
+What: /sys/fs/f2fs//trim_sections +Date: February 2015 +Contact: "Jaegeuk Kim" +Description: + Controls the trimming rate in batch mode. + What: /sys/fs/f2fs//cp_interval Date: October 2015 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a32d1a2523a5..0c22dfd69d6e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -195,7 +195,12 @@ enum { CP_DISCARD, }; -#define MAX_DISCARD_BLOCKS(sbi) (1 << (sbi)->log_blocks_per_seg) +#define DEF_BATCHED_TRIM_SECTIONS 2 +#define BATCHED_TRIM_SEGMENTS(sbi) \ + (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) +#define BATCHED_TRIM_BLOCKS(sbi) \ + (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) + #define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -718,6 +723,9 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; + /* for batched trimming */ + unsigned int trim_sections; /* # of sections to trim */ + struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 83d34c80ec37..ab0fc88bfe17 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -965,8 +965,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, if (!list_empty(head)) { last = list_last_entry(head, struct discard_entry, list); if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len && - last->len <= MAX_DISCARD_BLOCKS(sbi)) { + last->blkaddr + last->len) { last->len += end - start; goto done; } @@ -1706,25 +1705,36 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) "Found FS corruption, run fsck to fix."); goto out; } - if (sbi->discard_blks == 0) - goto out; /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); - /* - * do checkpoint to issue discard commands safely since we now can - * use async discard. - */ cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); - cpc.trim_start = start_segno; - cpc.trim_end = end_segno; - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + /* do checkpoint to issue discard commands safely */ + for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { + cpc.trim_start = start_segno; + + if (sbi->discard_blks == 0) + break; + else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) + cpc.trim_end = end_segno; + else + cpc.trim_end = min_t(unsigned int, + rounddown(start_segno + + BATCHED_TRIM_SEGMENTS(sbi), + sbi->segs_per_sec) - 1, end_segno); + + mutex_lock(&sbi->gc_mutex); + err = write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); + } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -2885,6 +2895,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; + INIT_LIST_HEAD(&sm_info->sit_entry_set); if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 08b6ba9b3f14..171ca84c7769 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -287,6 +287,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -311,6 +312,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 0166e6469f9dbbd0c98c71a4803f818f98749929 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 16:58:09 -0800 Subject: [PATCH 121/804] f2fs: fix trim_fs assignment This is missing fix from upstream. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ab0fc88bfe17..f69ddd77558f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1675,14 +1675,19 @@ static const struct segment_allocation default_salloc_ops = { bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; + bool has_candidate = false; mutex_lock(&SIT_I(sbi)->sentry_lock); - for (; trim_start <= cpc->trim_end; trim_start++) - if (add_discard_addrs(sbi, cpc, true)) + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; break; + } + } mutex_unlock(&SIT_I(sbi)->sentry_lock); - return trim_start <= cpc->trim_end; + cpc->trim_start = trim_start; + return has_candidate; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) From 4e50b7053c191969b664d447068113990a2e7ca9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 3 Feb 2017 17:44:04 -0800 Subject: [PATCH 122/804] f2fs: check io submission more precisely This patch check IO submission more precisely than previous rough check. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 23 +++++++++++++++++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 27 +++++++++++++++++++-------- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8a0bab49f0d..7e1b93dff2bb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -379,6 +379,9 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + /* set submitted = 1 as a return value */ + fio->submitted = 1; + if (!is_read) inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -1343,8 +1346,8 @@ out_writepage: return err; } -static int __write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1362,6 +1365,7 @@ static int __write_data_page(struct page *page, .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, + .submitted = false, }; trace_f2fs_writepage(page, DATA); @@ -1427,13 +1431,19 @@ out: if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; @@ -1448,7 +1458,7 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, wbc); + return __write_data_page(page, NULL, wbc); } /* @@ -1507,6 +1517,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1540,7 +1551,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - 
ret = __write_data_page(page, wbc); + ret = __write_data_page(page, &submitted, wbc); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1554,7 +1565,7 @@ continue_unlock: done_index = page->index + 1; done = 1; break; - } else { + } else if (submitted) { nwritten++; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0c22dfd69d6e..b51c4a36bf50 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -795,6 +795,7 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + bool submitted; /* indicate IO submission */ }; #define is_read_io(rw) (rw == READ) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d24bdb970a24..4b9e116b781e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1318,7 +1318,7 @@ continue_unlock: return last_page; } -static int __write_node_page(struct page *page, bool atomic, +static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -1331,6 +1331,7 @@ static int __write_node_page(struct page *page, bool atomic, .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, + .submitted = false, }; trace_f2fs_writepage(page, NODE); @@ -1372,13 +1373,19 @@ static int __write_node_page(struct page *page, bool atomic, dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); - if (wbc->for_reclaim) + if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + submitted = NULL; + } unlock_page(page); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, NODE, WRITE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; return 0; @@ -1390,7 +1397,7 @@ redirty_out: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, wbc); + return 
__write_node_page(page, false, NULL, wbc); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1424,6 +1431,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); @@ -1473,12 +1481,13 @@ continue_unlock: goto continue_unlock; ret = __write_node_page(page, atomic && - page == last_page, wbc); + page == last_page, + &submitted, wbc); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } else { + } else if (submitted) { nwritten++; } @@ -1534,6 +1543,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1587,9 +1597,10 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, wbc); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) From 5b7c84083345af3007d71394037cb2cec31e26f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 1 Feb 2017 16:51:22 -0800 Subject: [PATCH 123/804] f2fs: check last page index in cached bio to decide submission If the cached bio has the last page's index, then we need to submit it. Otherwise, we don't need to submit it and can wait for further IO merges. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/data.c | 43 ++++++++++++++++++++++--------------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/node.c | 12 +++++++----- fs/f2fs/segment.c | 13 +++++++------ 5 files changed, 40 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 45ef3b6bfb04..c943452098a3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -250,7 +250,8 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, + 0, page->index, META, WRITE); unlock_page(page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7e1b93dff2bb..b362b2ce3b3a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -243,8 +243,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -253,7 +253,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -263,10 +263,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -275,22 +276,21 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page 
*page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; bool ret; down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); + ret = __has_merged_page(io, inode, ino, idx); up_read(&io->io_rwsem); return ret; } static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -299,7 +299,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) + if (!__has_merged_page(io, inode, ino, idx)) goto out; /* change META to META_FLUSH in the checkpoint procedure */ @@ -318,15 +318,15 @@ out: void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, int rw) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + __f2fs_submit_merged_bio(sbi, NULL, 0, 0, type, rw); } void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + if (has_merged_page(sbi, inode, ino, idx, type)) + __f2fs_submit_merged_bio(sbi, inode, ino, idx, type, rw); } void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) @@ -1429,7 +1429,8 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, + DATA, WRITE); remove_dirty_inode(inode); submitted = NULL; } @@ -1477,10 +1478,10 @@ static int f2fs_write_cache_pages(struct 
address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); @@ -1566,7 +1567,7 @@ continue_unlock: done = 1; break; } else if (submitted) { - nwritten++; + last_idx = page->index; } if (--wbc->nr_to_write <= 0 && @@ -1588,9 +1589,9 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) + if (last_idx != ULONG_MAX) f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + 0, last_idx, DATA, WRITE); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b51c4a36bf50..840a37c20566 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2300,8 +2300,8 @@ void destroy_checkpoint_caches(void); void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, int rw); void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw); + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, int rw); void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_submit_page_mbio(struct f2fs_io_info *fio); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b9e116b781e..86ff0da6d6aa 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1374,7 +1374,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, 0, + page->index, NODE, WRITE); submitted = NULL; } @@ -1404,12 +1405,12 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret 
= 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1488,7 +1489,7 @@ continue_unlock: f2fs_put_page(last_page, 0); break; } else if (submitted) { - nwritten++; + last_idx = page->index; } if (page == last_page) { @@ -1514,8 +1515,9 @@ continue_unlock: goto retry; } out: - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_bio_cond(sbi, NULL, ino, last_idx, + NODE, WRITE); return ret ? -EIO: 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f69ddd77558f..ddada895787f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -263,7 +263,7 @@ static int __commit_inmem_pages(struct inode *inode, .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .encrypted_page = NULL, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -289,15 +289,15 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_bio_cond(sbi, inode, 0, last_idx, + DATA, WRITE); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -2011,7 +2011,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_bio_cond(sbi, page->mapping->host, + 0, page->index, type, WRITE); if (ordered) wait_on_page_writeback(page); else From 1b1f1ea0e7f750d746f75b110978f3ebab22b0c4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 6 Feb 2017 13:57:58 -0800 Subject: 
[PATCH 124/804] f2fs: remove preflush for nobarrier case This patch removes REQ_PREFLUSH in the nobarrier case. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b362b2ce3b3a..e8f605bafc4e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -306,9 +306,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_FUA; + io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); out: From 194fbd2a5710c4a9446f52b6a4e144b0925cb8ea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Feb 2017 13:25:35 -0800 Subject: [PATCH 125/804] f2fs: show checkpoint version at mount time If we mounted f2fs successfully, let's show current checkpoint version. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 171ca84c7769..6ab7f6aa337c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2103,6 +2103,8 @@ skip_recovery: sbi->valid_super_block ? 1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; From 56cba038b6226df91b11a557fdf2c3492b92c054 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Sat, 11 Feb 2017 15:50:46 +0530 Subject: [PATCH 126/804] f2fs: super: constify fscrypt_operations structure Declare fscrypt_operations structure as const as it is only stored in the s_cop field of a super_block structure. This field is of type const, so fscrypt_operations structure having this property can be made const too. 
File size before: fs/f2fs/super.o text data bss dec hex filename 54131 31355 184 85670 14ea6 fs/f2fs/super.o File size after: fs/f2fs/super.o text data bss dec hex filename 54227 31259 184 85670 14ea6 fs/f2fs/super.o Signed-off-by: Bhumika Goyal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6ab7f6aa337c..ebad846ba1f1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1210,7 +1210,7 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .get_context = f2fs_get_context, .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, @@ -1219,7 +1219,7 @@ static struct fscrypt_operations f2fs_cryptops = { .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif From a71c22fcd5c85453e5d362d2cddcd0fb45230630 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Feb 2017 17:39:45 +0800 Subject: [PATCH 127/804] f2fs: change recovery policy of xattr node block Currently, if we call fsync after updating the xattr data belonging to the file, f2fs needs to trigger checkpoint to keep xattr data consistent. But, this policy causes low performance as checkpoint will block most foreground operations and cause unneeded and unrelated IOs around checkpoint. This patch will reuse regular file recovery policy for xattr node block, so, we change to write xattr node block tagged with fsync flag to warm area instead of cold area, and during recovery, we search warm node chain for fsynced xattr block, and do the recovery.
So, for below application IO pattern, performance can be improved obviously: - touch file - create/update/delete xattr entry in file - fsync file Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/file.c | 3 --- fs/f2fs/node.c | 29 +++++++++++++++++------------ fs/f2fs/node.h | 2 +- fs/f2fs/recovery.c | 8 +++----- fs/f2fs/xattr.c | 2 -- 6 files changed, 22 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 840a37c20566..848f24e40cdb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -527,7 +527,6 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ struct list_head dirty_list; /* dirty list for dirs and files */ @@ -2200,7 +2199,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void recover_inline_xattr(struct inode *inode, struct page *page); -void recover_xattr_data(struct inode *inode, struct page *page, +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr); int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int restore_node_summary(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 12c12cb4a06f..948d440dd3ce 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -143,8 +143,6 @@ static inline bool need_do_checkpoint(struct inode *inode) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; else if (test_opt(sbi, FASTBOOT)) need_cp = true; else if (sbi->active_logs == 2) @@ -170,7 +168,6 @@ static void 
try_to_fix_pino(struct inode *inode) nid_t pino; down_write(&fi->i_sem); - fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 86ff0da6d6aa..f8abf61be75b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -971,9 +971,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -2057,18 +2054,18 @@ update_inode: f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; nid_t new_xnid = nid_of_node(page); struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); @@ -2076,19 +2073,27 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ + /* 2: update xattr nid in inode */ + remove_free_nid(sbi, new_xnid); + f2fs_i_xnid_write(inode, new_xnid); if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); + update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); + if (!xpage) + return -ENOMEM; + + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); - remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, 
&ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 29ff783eb9c3..d3d289306469 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -358,7 +358,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e93316ea8d1b..d025aa83fb5b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -378,11 +378,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. - */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ba67ca0c7014..8eca9022bf16 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -505,8 +505,6 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } From 69a0a6912f7882f3ddec5b1f5559871fddd1a05c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 11 Feb 2017 10:46:44 -0800 Subject: [PATCH 128/804] f2fs: remove build_free_nids() during checkpoint Let's avoid build_free_nids() in checkpoint path. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c943452098a3..fd8db9d1ceea 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1000,8 +1000,6 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi, false); f2fs_unlock_all(sbi); } From 34a65412384c9e4f0126b95e7e4c1ce818623703 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Feb 2017 17:02:44 -0800 Subject: [PATCH 129/804] f2fs: avoid reading NAT page by get_node_info We've not seen this buggy case for a long time, so it's time to avoid this unnecessary get_node_info() call which reading NAT page to cache nat entry. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f8abf61be75b..8137903c9012 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1028,7 +1028,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1043,13 +1043,15 @@ struct page *new_node_page(struct dnode_of_data *dn, err = -ENOSPC; goto fail; } - - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); From 79887aed377059190c10d76a55674e6d189614f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 15 Feb 2017 10:34:45 +0800 
Subject: [PATCH 130/804] f2fs: introduce noinline_xattr mount option This patch introduces new mount option 'noinline_xattr', so we can disable inline xattr functionality which is already set as a default mount option. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 1 + fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index d99faced79cb..8e454b0559f1 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -125,6 +125,7 @@ active_logs=%u Support configuring the number of active logs. In the disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. 
inline_dentry Enable the inline dir feature: data in new created diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ebad846ba1f1..7b3fe81db741 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -89,6 +89,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -122,6 +123,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -444,6 +446,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -457,6 +462,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -909,6 +918,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) From 675dd8213b7074093662cd83519ce4ed5d78bca4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Feb 2017 17:39:44 +0800 Subject: [PATCH 131/804] f2fs: enable inline_xattr by default In Android, since SELinux is enabled, a security policy is applied to each file and stored as an xattr entry of the inode, so it takes one additional 4k node block for each file.
Let's enable inline_xattr by default in order to save storage space. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7b3fe81db741..35e712bbccf1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1040,6 +1040,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); From 5331a1d87fc592190b4eb7cbe6031ea1c6e6a70c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 14 Feb 2017 19:32:51 -0800 Subject: [PATCH 132/804] f2fs: use SSR for warm node as well We have had node chains, but haven't used it so far due to stale node blocks. Now, we have crc|cp_ver in node footer and give random cp_ver at format time, we can start to use it again. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ddada895787f..3c39982bd4b8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1642,7 +1642,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); From ff3bf2f2079260ccd00a2847f244dd0acb6c67b1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 15 Feb 2017 11:14:06 -0800 Subject: [PATCH 133/804] f2fs: show actual device info in tracepoints This patch shows actual device information in the tracepoints. 
Signed-off-by: Jaegeuk Kim Conflicts: include/trace/events/f2fs.h --- fs/f2fs/segment.c | 10 ++++---- include/trace/events/f2fs.h | 49 ++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3c39982bd4b8..4fc23afc03e2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -426,11 +426,11 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) int ret = __submit_flush_wait(sbi->sb->s_bdev); int i; - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - if (sbi->s_ndevs && !ret) { for (i = 1; i < sbi->s_ndevs; i++) { + trace_f2fs_issue_flush(FDEV(i).bdev, + test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); ret = __submit_flush_wait(FDEV(i).bdev); if (ret) break; @@ -839,7 +839,7 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, block_t lblkstart = blkstart; int err; - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + trace_f2fs_issue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); @@ -894,7 +894,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 217691582dd4..bd1772879c8c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -6,8 +6,8 @@ #include -#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev) -#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino +#define show_dev(dev) MAJOR(dev), MINOR(dev) +#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino TRACE_DEFINE_ENUM(NODE); 
TRACE_DEFINE_ENUM(DATA); @@ -239,7 +239,7 @@ TRACE_EVENT(f2fs_sync_fs, ), TP_printk("dev = (%d,%d), superblock is %s, wait = %d", - show_dev(__entry), + show_dev(__entry->dev), __entry->dirty ? "dirty" : "not dirty", __entry->wait) ); @@ -538,7 +538,7 @@ TRACE_EVENT(f2fs_background_gc, ), TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) @@ -580,7 +580,7 @@ TRACE_EVENT(f2fs_get_victim, TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u " "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u", - show_dev(__entry), + show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), @@ -717,7 +717,7 @@ TRACE_EVENT(f2fs_reserve_new_blocks, ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", - show_dev(__entry), + show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) @@ -787,6 +787,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_STRUCT__entry( __field(dev_t, dev) + __field(dev_t, target) __field(int, op) __field(int, op_flags) __field(int, type) @@ -796,6 +797,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_fast_assign( __entry->dev = sb->s_dev; + __entry->target = bio->bi_bdev->bd_dev; __entry->op = bio_op(bio); __entry->op_flags = bio->bi_rw; __entry->type = type; @@ -803,8 +805,9 @@ DECLARE_EVENT_CLASS(f2fs__bio, __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u", + show_dev(__entry->target), + show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, @@ -1101,16 +1104,16 @@ TRACE_EVENT(f2fs_write_checkpoint, ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", - 
show_dev(__entry), + show_dev(__entry->dev), show_cpreason(__entry->reason), __entry->msg) ); TRACE_EVENT(f2fs_issue_discard, - TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), - TP_ARGS(sb, blkstart, blklen), + TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) @@ -1119,22 +1122,22 @@ TRACE_EVENT(f2fs_issue_discard, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); TRACE_EVENT(f2fs_issue_reset_zone, - TP_PROTO(struct super_block *sb, block_t blkstart), + TP_PROTO(struct block_device *dev, block_t blkstart), - TP_ARGS(sb, blkstart), + TP_ARGS(dev, blkstart), TP_STRUCT__entry( __field(dev_t, dev) @@ -1142,21 +1145,21 @@ TRACE_EVENT(f2fs_issue_reset_zone, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; ), TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart) ); TRACE_EVENT(f2fs_issue_flush, - TP_PROTO(struct super_block *sb, unsigned int nobarrier, + TP_PROTO(struct block_device *dev, unsigned int nobarrier, unsigned int flush_merge), - TP_ARGS(sb, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge), TP_STRUCT__entry( __field(dev_t, dev) @@ -1165,13 +1168,13 @@ TRACE_EVENT(f2fs_issue_flush, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; ), TP_printk("dev = (%d,%d), %s %s", - show_dev(__entry), + show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", __entry->flush_merge ? 
" with flush_merge" : "") ); @@ -1286,7 +1289,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt) ); @@ -1333,7 +1336,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", - show_dev(__entry), + show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); From e5e16d8af5cb4936cb2438e4961769878583c76c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 14 Feb 2017 09:54:37 -0800 Subject: [PATCH 134/804] f2fs: fix multiple f2fs_add_link() calls having same name It turns out a stakable filesystem like sdcardfs in AOSP can trigger multiple vfs_create() to lower filesystem. In that case, f2fs will add multiple dentries having same name which breaks filesystem consistency. Until upper layer fixes, let's work around by f2fs, which shows actually not much performance regression. Cc: Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 34 +++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 1 + 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4436079dbf0c..ab5343f79f9b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -207,9 +207,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - if (!de && room && F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; + /* This is to increase the speed of f2fs_create */ + if (!de && room) { + F2FS_I(dir)->task = current; + if (F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; + } } return de; @@ -643,14 +647,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = 
fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 848f24e40cdb..24b5fd2d1f03 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -526,6 +526,7 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ From 4c05cfb0081f3fca7de0a7d9d458305e655dd82b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 17 Feb 2017 17:16:38 +0800 Subject: [PATCH 135/804] f2fs: replace __get_victim by dirty_segments in FG_GC In FG_GC process, it will search victim section twice. This will cause some dirty section with less valid blocks skip garbage collection. 
section # 26425 : valid blocks # 3 142.037567: get_victim_by_default: victim 26425 : valid blocks # 3 142.037585: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 142.039494: f2fs_get_victim: dev = (259,30), type = Hot DATA, policy = (Background GC, SSR-mode, Greedy), victim = 19022 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 24 142.070247: new_curseg: Debug: alloc new segment 26746 142.244341: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26054 ofs_unit = 1, pre_victim_secno = 26054, prefree = 0, free = 243 142.254475: do_garbage_collect: Debug: FG_GC, seg_freed = 1 142.293131: f2fs_get_victim: dev = (259,30), type = Warm DATA, policy = (Background GC, SSR-mode, Greedy), victim = 23466 ofs_unit = 1, pre_victim_secno = -1, prefree = 0, free = 244 142.319001: f2fs_get_victim: dev = (259,30), type = Warm DATA, policy = (Background GC, SSR-mode, Greedy), victim = 23467 ofs_unit = 1, pre_victim_secno = -1, prefree = 0, free = 244 142.368879: get_victim_by_default: victim 26425 : valid blocks # 3 142.368894: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 142.378127: f2fs_get_victim: dev = (259,30), type = Hot DATA, policy = (Background GC, SSR-mode, Greedy), victim = 19612 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 24 142.416917: new_curseg: Debug: alloc new segment 26054 142.656794: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 25404 ofs_unit = 1, pre_victim_secno = 25404, prefree = 0, free = 243 142.662139: do_garbage_collect: Debug: FG_GC, seg_freed = 1 142.684159: new_curseg: Debug: alloc new segment 25197 142.685059: get_victim_by_default: victim 26425 : valid blocks # 3 
142.685079: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 243 142.701427: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26238 ofs_unit = 1, pre_victim_secno = 26238, prefree = 0, free = 243 142.707105: do_garbage_collect: Debug: FG_GC, seg_freed = 1 142.802444: f2fs_get_victim: dev = (259,30), type = Warm DATA, policy = (Background GC, SSR-mode, Greedy), victim = 23473 ofs_unit = 1, pre_victim_secno = -1, prefree = 0, free = 244 142.804422: get_victim_by_default: victim 26425 : valid blocks # 3 142.804443: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 142.851567: f2fs_get_victim: dev = (259,30), type = Hot DATA, policy = (Background GC, SSR-mode, Greedy), victim = 19092 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 24 142.865014: new_curseg: Debug: alloc new segment 26238 143.082245: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26307 ofs_unit = 1, pre_victim_secno = 26307, prefree = 0, free = 244 143.088252: do_garbage_collect: Debug: FG_GC, seg_freed = 1 143.128307: new_curseg: Debug: alloc new segment 25404 143.181846: get_victim_by_default: victim 26425 : valid blocks # 3 143.181872: f2fs_get_victim: dev = (259,30), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 26425 ofs_unit = 1, pre_victim_secno = 26425, prefree = 0, free = 244 Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7f0c3e02408c..518557bfad42 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -927,8 +927,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool 
background) cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (unlikely(f2fs_cp_error(sbi))) { @@ -943,12 +941,10 @@ gc_more: * enough free sections, we should flush dent/node blocks and do * garbage collections. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { + if (dirty_segments(sbi) || prefree_segments(sbi)) { ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; - segno = NULL_SEGNO; } else if (has_not_enough_free_secs(sbi, 0, 0)) { ret = write_checkpoint(sbi, &cpc); if (ret) @@ -959,7 +955,7 @@ gc_more: goto stop; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) + if (!__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; From e24eb1fceda8df42f824bd51bfc28e8019848c5b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 17 Feb 2017 09:55:55 -0800 Subject: [PATCH 136/804] f2fs: do not wait for writeback in write_begin Otherwise we can get livelock like below. [79880.428136] dbench D 0 18405 18404 0x00000000 [79880.428139] Call Trace: [79880.428142] __schedule+0x219/0x6b0 [79880.428144] schedule+0x36/0x80 [79880.428147] schedule_timeout+0x243/0x2e0 [79880.428152] ? update_sd_lb_stats+0x16b/0x5f0 [79880.428155] ? ktime_get+0x3c/0xb0 [79880.428157] io_schedule_timeout+0xa6/0x110 [79880.428161] __lock_page+0xf7/0x130 [79880.428164] ? unlock_page+0x30/0x30 [79880.428167] pagecache_get_page+0x16b/0x250 [79880.428171] grab_cache_page_write_begin+0x20/0x40 [79880.428182] f2fs_write_begin+0xa2/0xdb0 [f2fs] [79880.428192] ? f2fs_mark_inode_dirty_sync+0x16/0x30 [f2fs] [79880.428197] ? kmem_cache_free+0x79/0x200 [79880.428203] ? __mark_inode_dirty+0x17f/0x360 [79880.428206] generic_perform_write+0xbb/0x190 [79880.428213] ? 
file_update_time+0xa4/0xf0 [79880.428217] __generic_file_write_iter+0x19b/0x1e0 [79880.428226] f2fs_file_write_iter+0x9c/0x180 [f2fs] [79880.428231] __vfs_write+0xc5/0x140 [79880.428235] vfs_write+0xb2/0x1b0 [79880.428238] SyS_write+0x46/0xa0 [79880.428242] entry_SYSCALL_64_fastpath+0x1e/0xad Fixes: cae96a5c8ab6 ("f2fs: check io submission more precisely") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e8f605bafc4e..2efca57fdb26 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1756,7 +1756,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = grab_cache_page(mapping, index); if (!page) { err = -ENOMEM; goto fail; From c1288c8f35c7dde04b857b17bc7437907f19cfac Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 16 Feb 2017 12:34:31 +0000 Subject: [PATCH 137/804] f2fs: add ovp valid_blocks check for bg gc victim to fg_gc For foreground gc, greedy algorithm should be adapted, which makes this formula work well: (2 * (100 / config.overprovision + 1) + 6) But currently, we fg_gc have a prior to select bg_gc victim segments to gc first, these victims are selected by cost-benefit algorithm, we can't guarantee such segments have the small valid blocks, which may destroy the f2fs rule, on the worstest case, would consume all the free segments. This patch fix this by add a filter in check_bg_victims, if segment's has # of valid blocks over overprovision ratio, skip such segments. 
Cc: Signed-off-by: Hou Pengyang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/gc.c | 22 ++++++++++++++++++++-- fs/f2fs/segment.h | 9 +++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 24b5fd2d1f03..053f5b30eb4b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -957,6 +957,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 518557bfad42..6cb0c81f56a5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -166,7 +166,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; @@ -199,6 +200,10 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } @@ -322,13 +327,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -985,5 +992,16 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, 
ovp_count, blocks_per_sec; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; + + sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec, + (main_count - resv_count)); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5cb5755c75d9..f4020f141d83 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -716,6 +716,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) From 1466b660bebe2334aad1f806c793be57f028f17d Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 21 Feb 2017 16:59:26 +0800 Subject: [PATCH 138/804] f2fs: put allocate_segment after refresh_sit_entry SIT information should be updated before segment allocation, since SSR needs latest valid block information. Current code does not update the old_blkaddr info in sit_entry, so adjust the allocate_segment to its proper location. Commit 5e443818fa0b2a2845561ee25bec181424fb2889 ("f2fs: handle dirty segments inside refresh_sit_entry") puts it into wrong location. 
Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4fc23afc03e2..22e9c31e189f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1838,14 +1838,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); + mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) From 316bed49a61855f454fee133d244bd51812055d1 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Wed, 22 Feb 2017 10:28:59 +0000 Subject: [PATCH 139/804] f2fs: node segment is prior to data segment selected victim Since GC of a data segment may also dirty its dnodes, the greedy cost of a data segment should be valid blocks * 2; that is, a node segment is preferred over a data segment as the victim. Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6cb0c81f56a5..07e61b6139cc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -242,6 +242,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, sbi->segs_per_sec); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
+ valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -250,7 +260,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } From e3e27c59487bf43b9b5a8bfc1d825e5720443332 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 22 Feb 2017 20:50:49 +0800 Subject: [PATCH 140/804] f2fs: do SSR for data when there is enough free space In allocate_segment_by_default(), need_SSR() already detected it's time to do SSR. So, let's try to find victims for data segments more aggressively in time. Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 22e9c31e189f..98eef04bffa3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1619,7 +1619,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) + if (IS_NODESEG(type)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); From 62d8564f71ec326e7cb5ba2ab582942858ccccec Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 16:39:11 -0800 Subject: [PATCH 141/804] f2fs: do SSR in higher priority Let's check SSR in prior to LFS allocation. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 98eef04bffa3..209fe59e45f4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1403,17 +1403,6 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int segno = curseg->segno + 1; - struct free_segmap_info *free_i = FREE_I(sbi); - - if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) - return !test_bit(segno, free_i->free_segmap); - return 0; -} - /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1638,21 +1627,17 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { - struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) - new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, curseg); + stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); } void allocate_new_segments(struct f2fs_sb_info *sbi) From 3f2523b222146a4af5b0c0c65e82e641475a3c66 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 17:10:18 -0800 Subject: [PATCH 142/804] f2fs: find data segments across all the types Previously, if type is CURSEG_HOT_DATA, we only check CURSEG_HOT_DATA only. This patch fixes to search all the different types for SSR. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 209fe59e45f4..aebbc3dbc2de 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1607,16 +1607,23 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + int i; + + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + return 1; if (IS_NODESEG(type)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + return 0; /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + if (i == type) + continue; if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + BG_GC, i, SSR)) return 1; + } return 0; } From 91ef1346c8a399e90c9e3611914dd21f20172029 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 19:10:35 -0800 Subject: [PATCH 143/804] f2fs: avoid very large discard command This patch adds MAX_DISCARD_BLOCKS() to avoid issuing too much large single discard command. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 053f5b30eb4b..121ee0765fa4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -200,7 +200,8 @@ enum { (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) - +#define MAX_DISCARD_BLOCKS(sbi) \ + ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) #define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aebbc3dbc2de..63846d45b4ad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -965,7 +965,8 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, if (!list_empty(head)) { last = list_last_entry(head, struct discard_entry, list); if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { + last->blkaddr + last->len && + last->len < MAX_DISCARD_BLOCKS(sbi)) { last->len += end - start; goto done; } From 94d3f18b4f784e2ee114db3ad773527df992ac27 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 19:53:07 -0800 Subject: [PATCH 144/804] f2fs: much larger batched trim_fs job We have a kernel thread to issue discard commands, so we can increase the number of batched discard sections. By default, now it becomes 4GB range. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 121ee0765fa4..4440f52a83a3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -195,7 +195,7 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ From 97a61adb90fb6d15e46b693a7a60b5bde6b1c8d2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 19:58:23 -0800 Subject: [PATCH 145/804] f2fs: wait for discard completion after submission We don't need to wait for each discard commands when unmounting the image. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 63846d45b4ad..609d09ab6012 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -676,8 +676,12 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = &(dcc->discard_cmd_list); struct discard_cmd *dc, *tmp; + struct blk_plug plug; mutex_lock(&dcc->cmd_lock); + + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, wait_list, list) { if (blkaddr == NULL_ADDR) { @@ -686,9 +690,6 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) submit_bio(REQ_SYNC, dc->bio); atomic_inc(&dcc->submit_discard); } - wait_for_completion_io(&dc->wait); - - __remove_discard_cmd(sbi, dc); continue; } @@ -699,6 +700,15 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) __remove_discard_cmd(sbi, dc); } } + blk_finish_plug(&plug); + + /* this comes from f2fs_put_super */ + if (blkaddr == NULL_ADDR) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } + } 
mutex_unlock(&dcc->cmd_lock); } From 70dd5a4c5aa2440965085a36a17496be6ba9b760 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 20:18:35 -0800 Subject: [PATCH 146/804] f2fs: check discard alignment only for SEQWRITE zones For conventional zones, we don't need to align discard commands to exact zone size. Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 609d09ab6012..d77b2cddf9df 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -872,24 +872,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - sector_t sector; + sector_t sector, nr_sects; int devi = 0; if (sbi->s_ndevs) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - sector = SECTOR_FROM_BLOCK(blkstart); - - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { - f2fs_msg(sbi->sb, KERN_INFO, - "(%d) %s: Unaligned discard attempted (block %x + %x)", - devi, sbi->s_ndevs ? FDEV(devi).path: "", - blkstart, blklen); - return -EIO; - } /* * We need to know the type of the zone: for conventional zones, @@ -904,6 +893,17 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ?
FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); From cb9ca08d121ab076d027d3f37806fd315d9771fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Feb 2017 17:02:32 -0800 Subject: [PATCH 147/804] f2fs: do SSR for node segments more aggressively This patch gives more SSR chances for node blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d77b2cddf9df..934749663b61 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1618,17 +1618,22 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - int i; + int i, n; /* need_SSR() already forces to do this */ if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) return 1; - if (IS_NODESEG(type)) - return 0; + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + i = CURSEG_HOT_NODE; + n = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_DATA; + n = CURSEG_COLD_DATA; + } - /* For data segments, let's do SSR more intensively */ - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + for (; i <= n; i++) { if (i == type) continue; if (v_ops->get_victim(sbi, &(curseg)->next_segno, From c074e1b7c11cb877b8484f8f47ff618ff83b5169 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 21 Feb 2017 20:43:48 +0800 Subject: [PATCH 148/804] f2fs: remove unnecessary condition check for write_checkpoint in f2fs_gc Since has_not_enough_free_secs(sbi, 0, 0) must be true if has_not_enough_free_secs(sbi, sec_freed, 0) is true, write_checkpoint is sure to execute in both conditions.
Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 07e61b6139cc..2727d352817e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -958,15 +958,9 @@ gc_more: * enough free sections, we should flush dent/node blocks and do * garbage collections. */ - if (dirty_segments(sbi) || prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - } + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; } else if (gc_type == BG_GC && !background) { /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ goto stop; From c12a69d920f191a77d8f5134592af800b965bd21 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 23 Feb 2017 09:18:05 +0000 Subject: [PATCH 149/804] f2fs: init local extent_info to avoid stale stack info in tp To avoid such stale(fops, blk, len) info in f2fs_lookup_extent_tree_end tp dio-23095 [005] ...1 17878.856859: f2fs_lookup_extent_tree_end: dev = (259,30), ino = 856, pgofs = 0, ext_info(fofs: 3441207040, blk: 4294967232, len: 3481143808) Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++---- fs/f2fs/file.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2efca57fdb26..8cbfd1caf4be 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -511,7 +511,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -528,7 +528,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page 
*page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -803,7 +803,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -1664,7 +1664,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 948d440dd3ce..36082c11adb7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1896,7 +1896,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; From 7234370dc65c163b804acf700d9694ce7d78abdf Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 23 Feb 2017 09:18:06 +0000 Subject: [PATCH 150/804] f2fs: remove unsafe bitmap checking proc A: proc B: - writeback_sb_inodes - __writeback_single_inode - do_writepages - f2fs_write_node_pages - f2fs_balance_fs_bg - write_checkpoint - build_free_nids - flush_nat_entries - __build_free_nids - __flush_nat_entry_set - ra_meta_pages - get_next_nat_page - current_nat_addr - set_to_next_nat [do nat_bitmap checking] - f2fs_change_bit For proc A, nat_bitmap and nat_bitmap_mir would be compared without lock_op and nm_i->nat_tree_lock, while proc B is changing nat_bitmap/nat_bitmap_ver in cp. So it is normal for nat_bitmap/nat_bitmap difference under such scenario. This patch fixes this by removing the monitoring point.
[Fix: 599a09b f2fs: check in-memory nat version bitmap] Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index d3d289306469..3fc9c4b1dce9 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -209,12 +209,6 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) (seg_off << sbi->log_blocks_per_seg << 1) + (block_off & (sbi->blocks_per_seg - 1))); -#ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != - f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) - f2fs_bug_on(sbi, 1); -#endif - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; From 8cfbfea08204d8eb71ff5306c859379456e75bd3 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 23 Feb 2017 19:55:05 +0800 Subject: [PATCH 151/804] f2fs: avoid m_flags overlay when allocating more data blocks When more than one data blocks are allocated, the F2FS_MAP_UNWRITTEN/MAPPED flags will be overlapped by F2FS_MAP_NEW at the later times. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8cbfd1caf4be..5356cc3cf1a9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -867,7 +867,7 @@ next_block: } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { From 2ef086ad2991b52a5bf8a24408505bc53d6e43f8 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 23 Feb 2017 19:39:59 +0800 Subject: [PATCH 152/804] f2fs: replace rw semaphore extent_tree_lock with mutex lock This patch replace rw semaphore extent_tree_lock with mutex lock for no read cases with this lock. 
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 22 +++++++++++----------- fs/f2fs/f2fs.h | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6ed6424807b6..0ab5518e45c2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -77,7 +77,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +94,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -548,7 +548,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -570,11 +570,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. 
remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -604,7 +604,7 @@ free_node: spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -651,10 +651,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -662,12 +662,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -714,7 +714,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4440f52a83a3..94f0dcf48763 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -911,7 +911,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct rw_semaphore extent_tree_lock; /* 
locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ From 273924c37731e8a4e578f31727167338a237d1dd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Feb 2017 10:38:09 -0800 Subject: [PATCH 153/804] f2fs: add bitmaps for empty or full NAT blocks This patch adds bitmaps to represent empty or full NAT blocks containing free nid entries. If we can find valid crc|cp_ver in the last block of checkpoint pack, we'll use these bitmaps when building free nids. In order to avoid checkpointing burden, up-to-date bitmaps will be flushed only during umount time. So, normally we can get this gain, but when power-cut happens, we rely on fsck.f2fs which recovers this bitmap again. After this patch, we build free nids from nid #0 at mount time to make more full NAT blocks, but in runtime, we check empty NAT blocks to load free nids without loading any NAT pages from disk.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 28 +++++- fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 31 ++++++- fs/f2fs/node.c | 188 ++++++++++++++++++++++++++++++++++++---- fs/f2fs/segment.c | 2 +- include/linux/f2fs_fs.h | 1 + 6 files changed, 231 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fd8db9d1ceea..2a7824341a75 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1025,6 +1025,10 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); + if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason == CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else @@ -1137,6 +1141,28 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + unsigned int i; + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + + /* Flush all the NAT BITS pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + sync_meta_pages(sbi, META, LONG_MAX); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + } + } + /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) @@ -1273,7 +1299,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi); + flush_nat_entries(sbi, cpc); flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ diff --git a/fs/f2fs/debug.c 
b/fs/f2fs/debug.c index de8da9fc5c99..015ad2b73a92 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -193,6 +193,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); get_cache: si->cache_mem = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 94f0dcf48763..3e726878ccdc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,6 +621,7 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -631,6 +632,11 @@ struct f2fs_nm_info { /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif @@ -1238,6 +1244,27 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock(&sbi->cp_lock); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? 
(cpc->reason == CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2198,7 +2225,7 @@ void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); -void build_free_nids(struct f2fs_sb_info *sbi, bool sync); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); @@ -2209,7 +2236,7 @@ int recover_xattr_data(struct inode *inode, struct page *page, int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void flush_nat_entries(struct f2fs_sb_info *sbi); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int build_node_manager(struct f2fs_sb_info *sbi); void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8137903c9012..7facc1711baf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -338,6 +338,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); + if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR) + __clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); + /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) e = __lookup_nat_cache(nm_i, ni->ino); @@ -1844,7 +1847,60 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static int scan_nat_bits(struct f2fs_sb_info *sbi) +{ + 
struct f2fs_nm_info *nm_i = NM_I(sbi); + struct page *page; + unsigned int i = 0; + nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; + nid_t nid; + + if (!enabled_nat_bits(sbi, NULL)) + return -EAGAIN; + + down_read(&nm_i->nat_tree_lock); +check_empty: + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) { + i = 0; + goto check_partial; + } + + for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK; + nid++) { + if (unlikely(nid >= nm_i->max_nid)) + break; + add_free_nid(sbi, nid, true); + } + + if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + goto out; + i++; + goto check_empty; + +check_partial: + i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) { + disable_nat_bits(sbi, true); + up_read(&nm_i->nat_tree_lock); + return -EINVAL; + } + + nid = i * NAT_ENTRY_PER_BLOCK; + page = get_current_nat_page(sbi, nid); + scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + + if (nm_i->nid_cnt[FREE_NID_LIST] < target) { + i++; + goto check_partial; + } +out: + up_read(&nm_i->nat_tree_lock); + return 0; +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1859,6 +1915,21 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; + /* try to find free nids with nat_bits */ + if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) + return; + + /* find next valid candidate */ + if (enabled_nat_bits(sbi, NULL)) { + int idx = find_next_zero_bit_le(nm_i->full_nat_bits, + nm_i->nat_blocks, 0); + + if (idx >= nm_i->nat_blocks) + set_sbi_flag(sbi, SBI_NEED_FSCK); + else + nid = idx * NAT_ENTRY_PER_BLOCK; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1901,10 +1972,10 @@ static 
void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync); + __build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1946,7 +2017,7 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true); + build_free_nids(sbi, true, false); goto retry; } @@ -2238,8 +2309,39 @@ add_out: list_add_tail(&nes->set_list, head); } +void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2254,7 +2356,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. 
*/ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2294,10 +2397,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } f2fs_bug_on(sbi, set->entry_cnt); @@ -2308,7 +2413,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2329,7 +2434,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (cpc->reason == CP_UMOUNT || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2343,27 +2449,72 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (crc << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + static int init_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *sb_raw 
= F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - @@ -2397,6 +2548,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + err = __get_nat_bitmaps(sbi); + if (err) + return err; + #ifdef CONFIG_F2FS_CHECK_FS nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); @@ -2419,7 +2574,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi, true); + build_free_nids(sbi, true, true); return 0; } @@ -2478,6 +2633,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS kfree(nm_i->nat_bitmap_mir); #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 934749663b61..953599361fb0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -386,7 +386,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false); + build_free_nids(sbi, false, false); if (!is_idle(sbi)) return; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index f0748524ca8c..1c92ace2e8f8 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,7 @@ struct 
f2fs_super_block { /* * For checkpoint */ +#define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 From 1ad1cd4f71491ca811a0511bfdc8287a686d1244 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 23 Feb 2017 20:31:20 +0800 Subject: [PATCH 154/804] f2fs: no need lock_op in f2fs_write_inline_data Similar as f2fs_write_inode, f2fs_write_inline_data just mark inode page dirty, so it's no need to write inline data under read lock of cp_rwsem. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5356cc3cf1a9..86774b13ba42 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1411,9 +1411,12 @@ write: goto redirty_out; err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); + if (!err) + goto out; + } + f2fs_lock_op(sbi); if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) From 7d77c7a3525b5be2b5daaaf23ed71c7632f9b04a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 23 Feb 2017 17:43:49 -0800 Subject: [PATCH 155/804] f2fs: use __clear_bit_le Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ab5343f79f9b..0c7bd9a133a9 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -745,7 +745,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, From 2ea010a9c82622e475a00e20ee53b095edc53c3e Mon Sep 
17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Feb 2017 15:09:16 -0800 Subject: [PATCH 156/804] fscrypt: catch fscrypto_get_policy in v4.10-rc6 Signed-off-by: Jaegeuk Kim --- fs/crypto/policy.c | 39 ++++++++++++++++++++++++++------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/file.c | 19 ++----------------- include/linux/fscrypto.h | 12 ++++++------ 4 files changed, 36 insertions(+), 38 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..69ec4da11a7b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp, ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); + &policy); } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { + &policy)) { printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", __func__); @@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; if (!inode->i_sb->s_cop->get_context 
|| @@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -171,6 +179,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) BUG_ON(1); } + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + /* no restrictions if the parent directory is not encrypted */ if (!parent->i_sb->s_cop->is_encrypted(parent)) return 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3e726878ccdc..1e41664982a0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2708,8 +2708,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy 
#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 36082c11adb7..892caab7f74b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1772,31 +1772,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index ff8b11b26f31..e6e53a36104b 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -250,8 +250,8 @@ extern void fscrypt_restore_control_page(struct page *); extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); -extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); extern int 
fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, void *, bool); @@ -320,14 +320,14 @@ static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct file *f, - const struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f, + const void __user *arg) { return -EOPNOTSUPP; } -static inline int fscrypt_notsupp_get_policy(struct inode *i, - struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f, + void __user *arg) { return -EOPNOTSUPP; } From 3a40c74cce8bf6a05114d70317fe4c2c8b6ca50d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Feb 2017 11:08:28 +0800 Subject: [PATCH 157/804] f2fs: show simple call stack in fault injection message Previously kernel message can show that in which function we do the injection, but unfortunately, most of the caller are the same, for tracking more information of injection path, it needs to show upper caller's name. This patch supports that ability. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/data.c --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 4 +++- fs/f2fs/dir.c | 4 +++- fs/f2fs/f2fs.h | 20 +++++++++++++------- fs/f2fs/gc.c | 4 +++- fs/f2fs/inode.c | 4 +++- fs/f2fs/node.c | 4 +++- fs/f2fs/segment.c | 4 +++- 8 files changed, 32 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2a7824341a75..d30973a4e4d9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -495,6 +495,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 86774b13ba42..dacc9b2896d7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -55,8 +55,10 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 0c7bd9a133a9..35cbe7185594 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -549,8 +549,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1e41664982a0..c8ddca99acaa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1019,6 +1019,10 @@ struct f2fs_sb_info { }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : 
inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -1032,10 +1036,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -1344,8 +1344,10 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t diff; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); return false; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1585,8 +1587,10 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -2062,8 +2066,10 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2727d352817e..b77d1c806aba 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -48,8 +48,10 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); 
f2fs_stop_checkpoint(sbi, false); + } #endif /* diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index af06bda51a54..24bb8213d974 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -373,8 +373,10 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); goto no_delete; + } #endif remove_ino_entry(sbi, inode->i_ino, APPEND_INO); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7facc1711baf..dca0b1a2c395 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1990,8 +1990,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif spin_lock(&nm_i->nid_list_lock); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 953599361fb0..684a5165dd04 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -352,8 +352,10 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif if (!need) From 377816fec3f305ab5f4f72356363bfa98b992db5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 24 Feb 2017 18:46:00 +0800 Subject: [PATCH 158/804] f2fs: select target segment with closer temperature in SSR mode In SSR mode, we can allocate target segment which has different temperature type from the type of current block, in order to avoid mixing coldest and hottest data/node as much as possible, change SSR allocation policy to select closer temperature for current block prior. 
Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 684a5165dd04..e4ef306ba234 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1620,7 +1620,8 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - int i, n; + int i, cnt; + bool reversed = false; /* need_SSR() already forces to do this */ if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) @@ -1628,14 +1629,24 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(type)) { - i = CURSEG_HOT_NODE; - n = CURSEG_COLD_NODE; + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; } else { - i = CURSEG_HOT_DATA; - n = CURSEG_COLD_DATA; + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; } - for (; i <= n; i++) { + for (; cnt-- > 0; reversed ? 
i-- : i++) { if (i == type) continue; if (v_ops->get_victim(sbi, &(curseg)->next_segno, From 1b30dde97f843a00bd6c19c3bd6fe501a4487b8f Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Sat, 25 Feb 2017 03:57:38 +0000 Subject: [PATCH 159/804] f2fs: avoid bggc->fggc when enough free segments are available after cp We use has_not_enough_free_secs to check if there are enough free segments, (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + needed); Under a scenario with a large number of dirty nodes, these nodes would be flushed during cp; as a result, the right side of the inequality would decrease, while the left side stays unchanged if these nodes are flushed in the SSR way, which means there are enough free segments after this cp. For this case, we just do a bggc instead of fggc. Signed-off-by: Hou Pengyang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b77d1c806aba..8c8e7135ef58 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -953,21 +953,22 @@ gc_more: goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. */ ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; - } else if (gc_type == BG_GC && !background) { - /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ - goto stop; + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; } + /* f2fs_balance_fs doesn't need to do BG_GC in critical path.
*/ + if (gc_type == BG_GC && !background) + goto stop; if (!__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; From e042b87adaa7be0bdda3998f07cc3777fc845a71 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Feb 2017 17:29:54 +0800 Subject: [PATCH 160/804] f2fs: kill __is_extent_same Since commit ee6d182f2a19 ("f2fs: remove syncing inode page in all the cases") delayed inode element updating from inode cache to node page cache, so once largest cached extent is updated, we can make inode dirty immediately instead of checking and updating it in the end of extent cache update. The above commit didn't clean up unneeded codes in extent_cache.c, let's finish the job in this patch. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 8 +++----- fs/f2fs/f2fs.h | 7 ------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 0ab5518e45c2..c6934f014e0f 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -413,7 +413,7 @@ do_insert: return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -426,7 +426,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -434,7 +434,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -531,8 +531,6 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int 
nr_shrink) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c8ddca99acaa..b4bde555aaf4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -563,13 +563,6 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) -{ - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); -} - static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { From 2ed4b498d823d54e6c1d8e22390624f814b4b753 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:23:27 +0800 Subject: [PATCH 161/804] f2fs: Don't update the xattr data that same as the exist f2fs removes the old xattr data and appends the new data although the new data is same as the exist. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8eca9022bf16..4f8ab3c0edb1 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -585,6 +585,13 @@ cleanup: return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + return (entry->e_value_size == size) && !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -619,12 +626,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 
0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; From aa2a9a1c479bd0225c4f7bbd78957f7f9a5f133c Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:23:40 +0800 Subject: [PATCH 162/804] f2fs: drop the duplicate pval in f2fs_getxattr Fixes: ba38c27eb9 ("f2fs: enhance lookup xattr") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 4f8ab3c0edb1..d0d15920e3ff 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -514,7 +514,6 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, struct f2fs_xattr_entry *entry = NULL; int error = 0; unsigned int size, len; - char *pval; void *base_addr = NULL; if (name == NULL) @@ -536,8 +535,6 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, goto out; } - pval = entry->e_name + entry->e_name_len; - if (buffer) { char *pval = entry->e_name + entry->e_name_len; memcpy(buffer, pval, size); From 20adb5b3fe0bfde2d36413de4bed7963a3158184 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:32:21 +0800 Subject: [PATCH 163/804] f2fs: update the comment of default nr_pages to skipping Fixes: 2c237ebaa4 ("f2fs: avoid writing node/metapages during writes") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f4020f141d83..5e8ad4280a50 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -736,8 +736,8 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) * It is very important to gather 
dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { From 4db9ebac249f0882d4ace06b77b2e9dc8aa440df Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 25 Feb 2017 19:53:39 +0800 Subject: [PATCH 164/804] f2fs: new helper cur_cp_crc() getting crc in f2fs_checkpoint There are four places that getting the crc value in f2fs_checkpoint, just add a new helper cur_cp_crc for them. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/node.c | 5 +---- fs/f2fs/node.h | 20 +++++++------------- 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d30973a4e4d9..645c3f7f21ce 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -684,8 +684,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, return -EINVAL; } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block - + crc_offset))); + crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b4bde555aaf4..d0f8a6153068 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1193,6 +1193,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { 
unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dca0b1a2c395..3463a3e54750 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2465,9 +2465,6 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; unsigned int i; __u64 cp_ver = cur_cp_version(ckpt); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); block_t nat_bits_addr; if (!enabled_nat_bits(sbi, NULL)) @@ -2490,7 +2487,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } - cp_ver |= (crc << 32); + cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { disable_nat_bits(sbi, true); return 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3fc9c4b1dce9..2f9603fa85a5 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -300,14 +300,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ -315,14 +312,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if 
(__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + return cp_ver == cpver_of_node(page); } From 179e2535c7b7231285a9db9a14b663191acce80e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Feb 2017 10:53:49 +0800 Subject: [PATCH 165/804] f2fs: introduce free nid bitmap In a scenario of intensive node allocation, free nids will run out soon, and then we need to stop and load free nids by traversing NAT blocks; in the worst case, if the NAT blocks are not cached in memory, this generates IOs which slow down our foreground operations. In order to speed up node allocation, in this patch we introduce a new free_nid_bitmap array, so there is a bitmap table for each NAT block. Once the NAT block is loaded, the related bitmap cache will be switched on, and the bitmap will be set while traversing nat entries in the NAT block; later we can query and update nid usage status completely in memory. With such an implementation, I expect the performance of node allocation to improve in the long term after the filesystem image is mounted.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: include/linux/f2fs_fs.h --- fs/f2fs/debug.c | 2 + fs/f2fs/f2fs.h | 2 + fs/f2fs/node.c | 125 ++++++++++++++++++++++++++++++++++++---- include/linux/f2fs_fs.h | 1 + 4 files changed, 120 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 015ad2b73a92..a77df377e2e8 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -194,6 +194,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; get_cache: si->cache_mem = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0f8a6153068..f26b9b451e13 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -622,6 +622,8 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3463a3e54750..2909c935039a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1768,7 +1768,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -1777,14 +1778,14 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; + return false; if (build) { 
/* do not add allocated nids */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; + return false; } i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); @@ -1793,7 +1794,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (radix_tree_preload(GFP_NOFS)) { kmem_cache_free(free_nid_slab, i); - return 0; + return true; } spin_lock(&nm_i->nid_list_lock); @@ -1802,9 +1803,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) radix_tree_preload_end(); if (err) { kmem_cache_free(free_nid_slab, i); - return 0; + return true; } - return 1; + return true; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1825,17 +1826,36 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; @@ -1843,10 +1863,54 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) - add_free_nid(sbi, 
start_nid, true); + freed = add_free_nid(sbi, start_nid, true); + update_free_nid_bitmap(sbi, start_nid, freed); } } +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + unsigned int target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); +} + static int scan_nat_bits(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1915,9 +1979,17 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; - /* try to find free nids with nat_bits */ - if (!mount && !scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) - return; + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + + /* try to find free nids with nat_bits */ + if (!scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) + return; + } /* find next valid candidate */ if (enabled_nat_bits(sbi, NULL)) { @@ -2013,6 +2085,9 @@ retry: i->state = NID_ALLOC; 
__insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false); + spin_unlock(&nm_i->nid_list_lock); return true; } @@ -2067,6 +2142,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; + update_free_nid_bitmap(sbi, nid, true); + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2395,6 +2472,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2561,6 +2643,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } +int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + return 0; +} + int build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2573,6 +2671,10 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; + err = init_free_nid_cache(sbi); + if (err) + return err; + build_free_nids(sbi, true, true); return 0; } @@ -2631,6 +2733,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kfree(nm_i->nat_bitmap); kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 1c92ace2e8f8..e2d239ed4c60 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -279,6 +279,7 @@ struct f2fs_node { * For NAT entries */ #define 
NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ From b1305bba60c0a2bf2aeb2c16777f2fbe0d46f282 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sun, 26 Feb 2017 20:47:16 +0800 Subject: [PATCH 166/804] f2fs: use MAX_FREE_NIDS for the free nids target F2FS has defined MAX_FREE_NIDS for maximum of cached free nids target. Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2909c935039a..cbc0d6ca58da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1874,7 +1874,6 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; unsigned int i, idx; - unsigned int target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; down_read(&nm_i->nat_tree_lock); @@ -1890,7 +1889,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); - if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) goto out; } } @@ -1916,7 +1915,6 @@ static int scan_nat_bits(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct page *page; unsigned int i = 0; - nid_t target = FREE_NID_PAGES * NAT_ENTRY_PER_BLOCK; nid_t nid; if (!enabled_nat_bits(sbi, NULL)) @@ -1937,7 +1935,7 @@ check_empty: add_free_nid(sbi, nid, true); } - if (nm_i->nid_cnt[FREE_NID_LIST] >= target) + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) goto out; i++; goto check_empty; @@ -1955,7 +1953,7 @@ check_partial: scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); - if (nm_i->nid_cnt[FREE_NID_LIST] < target) { + if (nm_i->nid_cnt[FREE_NID_LIST] < MAX_FREE_NIDS) { i++; goto check_partial; } From 
6ac7367ebfee241a3df925859e6dd55dd89da2fc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Feb 2017 17:10:45 +0800 Subject: [PATCH 167/804] f2fs: fix to update F2FS_{CP_}WB_DATA count correctly We should only account F2FS_{CP_}WB_DATA IOs for write path, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dacc9b2896d7..58e7dcb9af62 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -398,7 +398,8 @@ alloc_new: if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { err = -EAGAIN; - dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + if (!is_read) + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, From 9113aae794eb25d3d74c5589b49c091727a9d78b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Feb 2017 18:43:12 +0800 Subject: [PATCH 168/804] f2fs: fix memory leak of write_io_dummy mempool during umount Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 35e712bbccf1..da9592d88edb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -818,7 +818,8 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); - + if (sbi->write_io_dummy) + mempool_destroy(sbi->write_io_dummy); destroy_percpu_info(sbi); kfree(sbi); } From f6493d7dd1cd7c6bf764fa3d4417a9562e9601b5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Feb 2017 18:43:13 +0800 Subject: [PATCH 169/804] f2fs: fix to enlarge size of write_io_dummy mempool It needs to double cache size of write_io_dummy mempool, otherwise we may run out of cache in the scenario where Data/Node IOs are issued concurrently.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index da9592d88edb..379259ce4cd1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1931,7 +1931,7 @@ try_onemore: if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = - mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); if (!sbi->write_io_dummy) goto free_options; } From e323e9ef9f41168ae00ba25c6560aabf9e706b1f Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 27 Feb 2017 18:59:53 +0800 Subject: [PATCH 170/804] f2fs: remove redundant set_page_dirty() This patch removes redundant set_page_dirty in truncate_blocks Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 892caab7f74b..cfd86ae20b7c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -568,8 +568,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); + truncate_inline_inode(ipage, from); if (from == 0) clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); From 5086fe4c101b9c5a7cca87754fc6b4a17101f9ff Mon Sep 17 00:00:00 2001 From: Masato Suzuki Date: Mon, 27 Feb 2017 20:52:49 +0900 Subject: [PATCH 171/804] f2fs: Fix zoned block device support The introduction of the multi-device feature partially broke the support for zoned block devices. In the function f2fs_scan_devices, sbi->devs allocation and initialization is skipped in the case of a single device mount. This results in no device information structure being allocated for the device.
This is fine if the device is a regular device, but in the case of a zoned block device, the device zone type array is not initialized, which causes the function __f2fs_issue_discard_zone to fail as get_blkz_type is unable to determine the zone type of a section. Fix this by always allocating and initializing the sbi->devs device information array even in the case of a single device if that device is zoned. For this particular case, make sure to obtain a reference on the single device so that the call to blkdev_put() in destroy_device_list operates as expected. Fixes: 3c62be17d4f562f4 ("f2fs: support multiple devices") Cc: # v4.10 Signed-off-by: Masato Suzuki Acked-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 71 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 379259ce4cd1..b7f8932c3502 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1745,36 +1745,59 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; int i; - for (i = 0; i < MAX_DEVICES; i++) { - if (!RDEV(i).path[0]) + /* Initialize single device information */ + if (!RDEV(0).path[0]) { +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(sbi->sb->s_bdev) == BLK_ZONED_NONE) return 0; + max_devices = 1; +#else + return 0; +#endif + } - if (i == 0) { - sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * - MAX_DEVICES, GFP_KERNEL); - if (!sbi->devs) - return -ENOMEM; - } + /* + * Initialize multiple devices information, or single + * zoned block device information. 
+ */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; - memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); - FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); - if (i == 0) { - FDEV(i).start_blk = 0; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); - } else { - FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; - } + for (i = 0; i < max_devices; i++) { - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -1794,6 +1817,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) "Failed to initialize F2FS blkzone information"); return -EINVAL; } + if (max_devices == 1) + break; f2fs_msg(sbi->sb, KERN_INFO, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", i, FDEV(i).path, From c1c90b7d9dc1440665b1cb0ba069013a5b483464 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Mon, 27 Feb 2017 13:02:58 +0000 Subject: [PATCH 172/804] f2fs: add f2fs_drop_inode 
tracepoint Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- include/trace/events/f2fs.h | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b7f8932c3502..85c282272067 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -624,6 +624,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -656,10 +657,12 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index bd1772879c8c..b95872b9c3ae 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -309,6 +309,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_ARGS(inode, ret) ); +DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, + + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret) +); + DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), From 95bfba756e531d42d1b06b45b12e3e0f65560aee Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Mon, 27 Feb 2017 13:02:59 +0000 Subject: [PATCH 173/804] f2fs: fix a plint compile warning fix such pclint warning: ... Loss of precision (arg. no. 
2) (unsigned long long to unsigned int)) Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8c8e7135ef58..68d6a4cad349 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1009,6 +1009,6 @@ void build_gc_manager(struct f2fs_sb_info *sbi) ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div_u64((main_count - ovp_count) * blocks_per_sec, + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, (main_count - resv_count)); } From 7375ae65fa6dae808669b6837b49ca40fe227531 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 27 Feb 2017 11:57:11 -0800 Subject: [PATCH 174/804] f2fs: avoid to issue redundant discard commands If segs_per_sec is over 1 like under SMR, previously f2fs issues discard commands redundantly on the same section, since we didn't move end position for the previous discard command. 
E.g., start end | | prefree_bitmap = [01111100111100] And, after issue discard for this section, end start | | prefree_bitmap = [01111100111100] Select this section again by searching from (end + 1), start end | | prefree_bitmap = [01111100111100] Fixes: 36abef4e796d38 ("f2fs: introduce mode=lfs mount option") Cc: Cc: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e4ef306ba234..a09c726cc1c3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1115,6 +1115,8 @@ next: start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); From 9cc3fbc9ea182cfd50758d754896108749f74808 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 27 Feb 2017 21:28:53 -0800 Subject: [PATCH 175/804] f2fs: avoid to flush nat journal entries This patch adds a missing condition which flushes nat journal entries unnecessarily introduced by: f2fs: add bitmaps for empty or full NAT blocks Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cbc0d6ca58da..81f0daad982b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2516,7 +2516,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (cpc->reason == CP_UMOUNT || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); From b5bb7b2de94dc8802a7eb27e668f7e4e122de209 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 8 Mar 2017 15:24:43 -0800 Subject: [PATCH 176/804] fscrypt: catch up to v4.11-rc1 fscrypt: - fs/crypto/bio.c changes f2fs: - fscrypt: use ENOKEY when file cannot be created w/o key - fscrypt: split supp and notsupp declarations into their own headers - fscrypt: make fscrypt_operations.key_prefix a string Signed-off-by: Jaegeuk Kim --- fs/crypto/Makefile | 1 + fs/crypto/bio.c | 143 ++++++++++++++++++ fs/crypto/crypto.c | 249 +++++++++++--------------------- fs/crypto/fname.c | 12 +- fs/crypto/fscrypt_private.h | 117 +++++++++++++++ fs/crypto/keyinfo.c | 52 +++---- fs/crypto/policy.c | 97 +++++-------- fs/f2fs/data.c | 4 +- fs/f2fs/dir.c | 5 +- fs/f2fs/f2fs.h | 40 +---- fs/f2fs/namei.c | 4 +- fs/f2fs/super.c | 14 +- include/linux/fscrypt_common.h | 146 +++++++++++++++++++ include/linux/fscrypt_notsupp.h | 168 +++++++++++++++++++++ include/linux/fscrypt_supp.h | 66 +++++++++ include/uapi/linux/fs.h | 15 ++ 16 files changed, 818 insertions(+), 315 deletions(-) create mode 100644 fs/crypto/bio.c create mode 100644 fs/crypto/fscrypt_private.h create mode 100644 include/linux/fscrypt_common.h create mode 100644 include/linux/fscrypt_notsupp.h create mode 100644 include/linux/fscrypt_supp.h diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..a91ed46fe503 --- /dev/null +++ b/fs/crypto/bio.c @@ -0,0 +1,143 @@ +/* + * This contains encryption functions for per-file encryption. 
+ * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. 
*/ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! 
*/ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(0, bio); + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2d40ab9edc9f..02a7a9286449 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,10 +24,9 @@ #include #include #include -#include #include #include -#include +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -44,7 +43,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -63,7 +62,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +87,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. 
*/ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +120,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -141,15 +140,10 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; @@ -162,6 +156,8 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -175,14 +171,14 @@ static int do_page_crypto(struct inode *inode, page_crypt_complete, &ecr); BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); + xts_tweak.index = cpu_to_le64(lblk_num); memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); if (rw == 
FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -202,53 +198,86 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. 
Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +294,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. 
+ * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,75 +308,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! 
*/ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -399,63 +375,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. 
*/ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; @@ -481,17 +400,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -520,7 +444,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..13052b85c393 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include #include -#include +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
*/ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); @@ -332,7 +332,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_crypt_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; @@ -367,7 +367,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..ea01e5279675 --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,117 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. 
+ */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include + +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. 
+ */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } + +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +/* crypto.c */ +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); + +/* keyinfo.c */ +extern int fscrypt_get_crypt_info(struct inode *); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..02eb6b9e4438 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,7 @@ #include #include -#include +#include "fscrypt_private.h" static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -77,26 +77,22 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix) { - u8 *full_key_descriptor; + char *description; struct key *keyring_key; struct fscrypt_key *master_key; const struct 
user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); @@ -178,7 +174,7 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +int fscrypt_get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,7 +184,7 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; @@ -206,12 +202,15 @@ retry: res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ -247,20 +246,10 @@ retry: if (!raw_key) goto out; - if 
(fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -269,7 +258,6 @@ retry: } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; @@ -327,7 +315,7 @@ int fscrypt_get_encryption_info(struct inode *inode) (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); + return fscrypt_get_crypt_info(inode); return 0; } EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 69ec4da11a7b..14b76da71269 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,40 +10,23 @@ #include #include -#include #include - -static int inode_has_encryption_context(struct inode *inode) -{ - if (!inode->i_sb->s_cop->get_context) - return 0; - return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); -} +#include "fscrypt_private.h" /* - * check whether the policy is consistent with the encryption context - * for the inode + * check whether an encryption policy is consistent with an encryption context */ -static int is_encryption_context_consistent_with_policy(struct inode *inode, +static bool is_encryption_context_consistent_with_policy( + const struct fscrypt_context *ctx, const struct fscrypt_policy *policy) { - struct fscrypt_context ctx; - int 
res; - - if (!inode->i_sb->s_cop->get_context) - return 0; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); + return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx->flags == policy->flags) && + (ctx->contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx->filenames_encryption_mode == + policy->filenames_encryption_mode); } static int create_encryption_context_from_policy(struct inode *inode, @@ -66,20 +49,12 @@ static int create_encryption_context_from_policy(struct inode *inode, FS_KEY_DESCRIPTOR_SIZE); if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); + policy->contents_encryption_mode)) return -EINVAL; - } if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); + policy->filenames_encryption_mode)) return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -98,6 +73,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; if (copy_from_user(&policy, arg, sizeof(policy))) return -EFAULT; @@ -114,9 +90,10 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if 
(ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir) ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) @@ -124,12 +101,14 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) else ret = create_encryption_context_from_policy(inode, &policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - &policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + } else if (ret == sizeof(ctx) && + is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; } inode_unlock(inode); @@ -151,8 +130,10 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; @@ -217,9 +198,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. 
- * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -240,19 +221,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 58e7dcb9af62..bda784e38407 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1309,7 +1309,9 @@ int do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); + PAGE_SIZE, 0, + fio->page->index, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); if (err == -ENOMEM) { diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 35cbe7185594..4e2153620a3b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -272,7 +272,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, 
&fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f26b9b451e13..fa463ef1ccc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,7 +22,11 @@ #include #include #include -#include +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include +#else +#include +#endif #include #include @@ -854,10 +858,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -865,11 +865,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif - #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ @@ -2704,29 +2699,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) return 0; #endif } - -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy -#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context 
fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index db3079cd665d..a5a9ffc8e358 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -400,7 +400,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -444,7 +444,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 85c282272067..291b92a486d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1206,12 +1206,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -1227,8 +1221,8 @@ static unsigned f2fs_max_namelen(struct inode *inode) } static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = 
f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, @@ -1568,12 +1562,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) mutex_init(&sbi->wio_mutex[NODE]); mutex_init(&sbi->wio_mutex[DATA]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h new file mode 100644 index 000000000000..547f81592ba1 --- /dev/null +++ b/include/linux/fscrypt_common.h @@ -0,0 +1,146 @@ +/* + * fscrypt_common.h: common declarations for per-file encryption + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#ifndef _LINUX_FSCRYPT_COMMON_H +#define _LINUX_FSCRYPT_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_CRYPTO_BLOCK_SIZE 16 + +struct fscrypt_info; + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. 
+ */ +static inline u32 fscrypt_symlink_data_len(u32 l) +{ + if (l < FS_CRYPTO_BLOCK_SIZE) + l = FS_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct fscrypt_symlink_data) - 1); +} + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto operations for filesystems + */ +struct fscrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*prepare_context)(struct inode *); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); +} + +static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' 
&& str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(const struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +#endif /* _LINUX_FSCRYPT_COMMON_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h new file mode 100644 index 000000000000..3511ca798804 --- /dev/null +++ b/include/linux/fscrypt_notsupp.h @@ -0,0 +1,168 @@ +/* + * fscrypt_notsupp.h + * + * This stubs out the fscrypt functions for filesystems configured without + * encryption support. + */ + +#ifndef _LINUX_FSCRYPT_NOTSUPP_H +#define _LINUX_FSCRYPT_NOTSUPP_H + +#include + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, + gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + return; +} + +static inline struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_decrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num) +{ + return -EOPNOTSUPP; +} + + +static inline void fscrypt_restore_control_page(struct page *page) +{ + return; +} + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + return; +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + return; +} + +/* policy.c */ +static inline int fscrypt_ioctl_set_policy(struct file *filp, + const void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int 
fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_has_permitted_context(struct inode *parent, + struct inode *child) +{ + return 0; +} + +static inline int fscrypt_inherit_context(struct inode *parent, + struct inode *child, + void *fs_data, bool preload) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_get_encryption_info(struct inode *inode) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_put_encryption_info(struct inode *inode, + struct fscrypt_info *ci) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, + u32 ilen) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, + u32 ilen, + struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) +{ + return; +} + +static inline int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +/* bio.c */ +static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, + struct bio *bio) +{ + return; +} + +static inline void fscrypt_pullback_bio_page(struct page **page, bool 
restore) +{ + return; +} + +static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + return -EOPNOTSUPP; +} + +#endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h new file mode 100644 index 000000000000..a140f47e9b27 --- /dev/null +++ b/include/linux/fscrypt_supp.h @@ -0,0 +1,66 @@ +/* + * fscrypt_supp.h + * + * This is included by filesystems configured with encryption support. + */ + +#ifndef _LINUX_FSCRYPT_SUPP_H +#define _LINUX_FSCRYPT_SUPP_H + +#include + +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, + unsigned int, unsigned int, + u64, gfp_t); +extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int, + unsigned int, u64); +extern void fscrypt_restore_control_page(struct page *); + +extern const struct dentry_operations fscrypt_d_ops; + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + d_set_d_op(dentry, &fscrypt_d_ops); +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +} + +/* policy.c */ +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, 
struct fscrypt_name *); +extern void fscrypt_free_filename(struct fscrypt_name *); +extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); + +/* bio.c */ +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, + unsigned int); + +#endif /* _LINUX_FSCRYPT_SUPP_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index c8c093e8c83d..ea33e08d9d75 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -176,6 +176,21 @@ struct inodes_stat_t { /* Policy provided via an ioctl on the topmost directory */ #define FS_KEY_DESCRIPTOR_SIZE 8 +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + + struct fscrypt_policy { __u8 version; __u8 contents_encryption_mode; From 9fa38a0c6456093028506d1577e29ed6246751a7 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 28 Feb 2017 20:32:41 +0800 Subject: [PATCH 177/804] f2fs: fix an error return value in truncate_partial_data_page This patch fix a error return value in truncate_partial_data_page Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cfd86ae20b7c..11053141ee4f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -530,7 +530,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); From 9b86801f59e4ea6375c49d5a316aaa8b8bb76efe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 4 Mar 2017 13:56:10 -0800 Subject: [PATCH 178/804] f2fs: don't need to invalidate wrong node page If f2fs_new_inode() fails, the bad inode will invalidate the 0'th node page during f2fs_evict_inode(), which is unnecessary. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 24bb8213d974..ef8610bf950f 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -411,7 +411,10 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { From 4b056f06acfac3a7a5124c5eb1c1ee43807b3367 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 6 Mar 2017 11:59:56 -0800 Subject: [PATCH 179/804] f2fs: don't overwrite node block by SSR This patch fixes that SSR can overwrite previous warm node block consisting of a node chain since the last checkpoint. 
Fixes: 5b6c6be2d878 ("f2fs: use SSR for warm node as well") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a09c726cc1c3..de30f4a86219 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1242,6 +1242,12 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { #ifdef CONFIG_F2FS_CHECK_FS From d95038cf6b1eaaa2b9dc9086563663180f7defb1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 10 Mar 2017 09:36:10 -0800 Subject: [PATCH 180/804] f2fs: le16_to_cpu for xattr->e_value_size This patch fixes missing le16 conversion, reported by kbuild test robot. 
Fixes: 5f35a2cd5 ("f2fs: Don't update the xattr data that same as the exist") Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index d0d15920e3ff..fb5062a4df77 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -586,7 +586,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, const void *value, size_t size) { void *pval = entry->e_name + entry->e_name_len; - return (entry->e_value_size == size) && !memcmp(pval, value, size); + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); } static int __f2fs_setxattr(struct inode *inode, int index, From 48da6d86af950c6e29538b5579046b361d00e876 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 10 Mar 2017 15:25:59 +0800 Subject: [PATCH 181/804] f2fs: __update_nat_bits() can be static Signed-off-by: Fengguang Wu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 81f0daad982b..286d015aab8e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2386,7 +2386,7 @@ add_out: list_add_tail(&nes->set_list, head); } -void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); From f0135c1551e03aa50c702b4e6caa722eec472082 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Fri, 10 Mar 2017 15:54:31 +0800 Subject: [PATCH 182/804] f2fs: update_free_nid_bitmap() can be static Signed-off-by: Fengguang Wu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 286d015aab8e..ae4711d17f5f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1826,7 +1826,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, 
i); } -void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -2641,7 +2641,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } -int init_free_nid_cache(struct f2fs_sb_info *sbi) +static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); From 7ae846e99ba2d099949eccba2dd48421960e1619 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 14:11:06 -0800 Subject: [PATCH 183/804] f2fs: use __set{__clear}_bit_le This patch uses __set{__clear}_bit_le for higher speed. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ae4711d17f5f..3a441d84643a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1836,9 +1836,9 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set return; if (set) - set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); else - clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1850,7 +1850,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; - set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; From 7abdfbd622db16c0a7f222b17601d8a5816d981a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 1 Mar 2017 17:09:07 +0800 Subject: [PATCH 184/804] f2fs: skip scanning free nid bitmap of full NAT blocks This patch adds accounting of free nids for each NAT block and, while scanning the free nid bitmaps, checks the count and skips the lookup in any full
NAT block. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/node.c --- fs/f2fs/debug.c | 1 + fs/f2fs/f2fs.h | 2 ++ fs/f2fs/node.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a77df377e2e8..ee2d0a485fc3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -196,6 +196,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa463ef1ccc6..7c7ebf323255 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -628,6 +628,8 @@ struct f2fs_nm_info { struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ + spinlock_t free_nid_lock; /* protect updating of nid count */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3a441d84643a..0184ba3fbe94 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1826,7 +1826,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set) +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1839,6 +1840,13 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + 
spin_lock(&nm_i->free_nid_lock); + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; + spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1850,6 +1858,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; @@ -1864,7 +1875,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed); + update_free_nid_bitmap(sbi, start_nid, freed, true); } } @@ -1880,6 +1891,8 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { if (!test_bit_le(i, nm_i->nat_block_bitmap)) continue; + if (!nm_i->free_nid_count[i]) + continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { nid_t nid; @@ -2084,7 +2097,7 @@ retry: __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2140,7 +2153,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2470,11 +2483,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false); + update_free_nid_bitmap(sbi, nid, 
false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2654,6 +2667,14 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; + + nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + + spin_lock_init(&nm_i->free_nid_lock); + return 0; } @@ -2733,6 +2754,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kvfree(nm_i->nat_block_bitmap); kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); kfree(nm_i->nat_bits); From e10680dfc72f2ff195201b7e348288af5c5b072a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Mar 2017 20:07:49 +0800 Subject: [PATCH 185/804] f2fs: combine nat_bits and free_nid_bitmap cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both nat_bits cache and free_nid_bitmap cache provide the same functionality as an intermediate cache between free nid cache and disk, but with different granularity of indicating free nid range, and different persistence policy. nat_bits cache provides better persistence ability, and free_nid_bitmap provides better granularity. In this patch we combine the advantages of both caches, so the final policy of the intermediate cache would be: - init: load free nid status from nat_bits into free_nid_bitmap - lookup: scan free_nid_bitmap before loading NAT blocks - update: update free_nid_bitmap in real-time - persistence: update and persist nat_bits in checkpoint This patch also resolves the performance regression reported by lkp-robot. 
commit: 4ac912427c4214d8031d9ad6fbc3bc75e71512df ("f2fs: introduce free nid bitmap") d00030cf9cd0bb96fdccc41e33d3c91dcbb672ba ("f2fs: use __set{__clear}_bit_le") 1382c0f3f9d3f936c8bc42ed1591cf7a593ef9f7 ("f2fs: combine nat_bits and free_nid_bitmap cache") 4ac912427c4214d8 d00030cf9cd0bb96fdccc41e33 1382c0f3f9d3f936c8bc42ed15 ---------------- -------------------------- -------------------------- %stddev %change %stddev %change %stddev \ | \ | \ 77863 ± 0% +2.1% 79485 ± 1% +50.8% 117404 ± 0% aim7.jobs-per-min 231.63 ± 0% -2.0% 227.01 ± 1% -33.6% 153.80 ± 0% aim7.time.elapsed_time 231.63 ± 0% -2.0% 227.01 ± 1% -33.6% 153.80 ± 0% aim7.time.elapsed_time.max 896604 ± 0% -0.8% 889221 ± 3% -20.2% 715260 ± 1% aim7.time.involuntary_context_switches 2394 ± 1% +4.6% 2503 ± 1% +3.7% 2481 ± 2% aim7.time.maximum_resident_set_size 6240 ± 0% -1.5% 6145 ± 1% -14.1% 5360 ± 1% aim7.time.system_time 1111357 ± 3% +1.9% 1132509 ± 2% -6.2% 1041932 ± 2% aim7.time.voluntary_context_switches ... Signed-off-by: Chao Yu Tested-by: Xiaolong Ye Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 125 +++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 78 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0184ba3fbe94..5c70f33a2b4c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -338,9 +338,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); - if (enabled_nat_bits(sbi, NULL) && new_blkaddr == NEW_ADDR) - __clear_bit_le(NAT_BLOCK_OFFSET(ni->nid), nm_i->empty_nat_bits); - /* update fsync_mark if its inode nat entry is still alive */ if (ni->nid != ni->ino) e = __lookup_nat_cache(nm_i, ni->ino); @@ -1827,7 +1824,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build) + bool set, bool build, bool locked) { struct f2fs_nm_info *nm_i = NM_I(sbi); 
unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1841,12 +1838,14 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - spin_lock(&nm_i->free_nid_lock); + if (!locked) + spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - spin_unlock(&nm_i->free_nid_lock); + if (!locked) + spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1875,7 +1874,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true); + update_free_nid_bitmap(sbi, start_nid, freed, true, false); } } @@ -1923,58 +1922,6 @@ out: up_read(&nm_i->nat_tree_lock); } -static int scan_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - unsigned int i = 0; - nid_t nid; - - if (!enabled_nat_bits(sbi, NULL)) - return -EAGAIN; - - down_read(&nm_i->nat_tree_lock); -check_empty: - i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); - if (i >= nm_i->nat_blocks) { - i = 0; - goto check_partial; - } - - for (nid = i * NAT_ENTRY_PER_BLOCK; nid < (i + 1) * NAT_ENTRY_PER_BLOCK; - nid++) { - if (unlikely(nid >= nm_i->max_nid)) - break; - add_free_nid(sbi, nid, true); - } - - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) - goto out; - i++; - goto check_empty; - -check_partial: - i = find_next_zero_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); - if (i >= nm_i->nat_blocks) { - disable_nat_bits(sbi, true); - up_read(&nm_i->nat_tree_lock); - return -EINVAL; - } - - nid = i * NAT_ENTRY_PER_BLOCK; - page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); - - if (nm_i->nid_cnt[FREE_NID_LIST] < MAX_FREE_NIDS) { - i++; - goto check_partial; - } -out: - 
up_read(&nm_i->nat_tree_lock); - return 0; -} - static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1996,21 +1943,6 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) if (nm_i->nid_cnt[FREE_NID_LIST]) return; - - /* try to find free nids with nat_bits */ - if (!scan_nat_bits(sbi) && nm_i->nid_cnt[FREE_NID_LIST]) - return; - } - - /* find next valid candidate */ - if (enabled_nat_bits(sbi, NULL)) { - int idx = find_next_zero_bit_le(nm_i->full_nat_bits, - nm_i->nat_blocks, 0); - - if (idx >= nm_i->nat_blocks) - set_sbi_flag(sbi, SBI_NEED_FSCK); - else - nid = idx * NAT_ENTRY_PER_BLOCK; } /* readahead nat pages to be scanned */ @@ -2097,7 +2029,7 @@ retry: __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false, false); + update_free_nid_bitmap(sbi, *nid, false, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2153,7 +2085,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); + update_free_nid_bitmap(sbi, nid, true, false, false); spin_unlock(&nm_i->nid_list_lock); @@ -2483,11 +2415,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); + update_free_nid_bitmap(sbi, nid, true, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false); + update_free_nid_bitmap(sbi, nid, false, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2593,6 +2525,40 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) return 0; } +inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, 
last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&nm_i->free_nid_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true, true); + spin_unlock(&nm_i->free_nid_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } +} + static int init_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); @@ -2694,6 +2660,9 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + build_free_nids(sbi, true, true); return 0; } From bf5320b6fbd9d37a3776d6daaf0f60fd6c6cd451 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 11 Mar 2017 21:18:01 +0800 Subject: [PATCH 189/804] f2fs: le32_to_cpu for ckpt->cp_pack_total_block_count Fixes: 22ad0b6ab4 ("f2fs: add bitmaps for empty or full NAT blocks") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 645c3f7f21ce..08b9a1f578e3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1025,7 +1025,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); - if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + if (cpc->reason == CP_UMOUNT && + le32_to_cpu(ckpt->cp_pack_total_block_count) > sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); From 3100307fdbdd104f0c5d50fae206859f14e362a3 Mon 
Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 13:32:20 -0800 Subject: [PATCH 190/804] f2fs: fix wrong error injection for evict_inode The previous one was not a proper location to inject an error, since there is no point to get errors. Instead, we can emulate EIO during truncation, and the below logic should handle it correctly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ef8610bf950f..2520fa72b23f 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(FAULT_EVICT_INODE); - goto no_delete; - } -#endif - remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -389,6 +382,12 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); From 8f326468d5b9c0b640a7265d9d4684917caa1334 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 11:22:45 -0800 Subject: [PATCH 191/804] f2fs: don't allow to get pino when filename is encrypted After renaming an encrypted file, we have no way to get its encrypted filename from its dentry. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 11053141ee4f..38d39f656746 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -112,6 +112,9 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; + if (file_enc_name(inode)) + return 0; + inode = igrab(inode); dentry = d_find_any_alias(inode); iput(inode); From 78e31d26ec81b8bc65589bb5289fb6ced240d320 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 4 Mar 2017 21:48:28 +0800 Subject: [PATCH 192/804] f2fs: fix the fault of checking F2FS_LINK_MAX for rename inode The parent directory's nlink will change, not the inode. Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a5a9ffc8e358..65fff81889cf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -905,8 +905,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } From ef250a614506801289ffb8cb1e8e20e1a4e341ec Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Wed, 8 Mar 2017 09:49:53 +0800 Subject: [PATCH 193/804] f2fs: fix the fault of calculating blkstart twice When the zone type is BLK_ZONE_TYPE_CONVENTIONAL, the blkstart is calculated twice. 
Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index de30f4a86219..b914cfb49096 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -875,6 +875,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { @@ -892,7 +893,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __f2fs_issue_discard_async(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: sector = SECTOR_FROM_BLOCK(blkstart); From b39d14bd84c39a4771dc80cb0acda88236b812e2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 13:41:22 -0800 Subject: [PATCH 194/804] f2fs: build stat_info before orphan inode recovery f2fs_sync_fs() -> write_checkpoint() calls stat_inc_cp_count(sbi->stat_info), which needs stat_info allocation. Otherwise, we can hit: [254042.598623] ? count_shadow_nodes+0xa0/0xa0 [254042.598633] f2fs_sync_fs+0x65/0xd0 [f2fs] [254042.598645] f2fs_balance_fs_bg+0xe4/0x1c0 [f2fs] [254042.598657] f2fs_write_node_pages+0x34/0x1a0 [f2fs] [254042.598664] ? pagevec_lookup_entries+0x1e/0x30 [254042.598673] do_writepages+0x1e/0x30 [254042.598682] __writeback_single_inode+0x45/0x330 [254042.598688] writeback_single_inode+0xd7/0x190 [254042.598694] write_inode_now+0x86/0xa0 [254042.598699] iput+0x122/0x200 [254042.598709] f2fs_fill_super+0xd4a/0x14d0 [f2fs] [254042.598717] mount_bdev+0x184/0x1c0 [254042.598934] ? f2fs_commit_super+0x100/0x100 [f2fs] [254042.599142] f2fs_mount+0x15/0x20 [f2fs] [254042.599349] mount_fs+0x39/0x160 [254042.599554] ? 
__alloc_percpu+0x15/0x20 [254042.599759] vfs_kern_mount+0x67/0x110 [254042.599972] do_mount+0x1bb/0xc80 [254042.600175] ? memdup_user+0x42/0x60 [254042.600380] SyS_mount+0x83/0xd0 [254042.600583] entry_SYSCALL_64_fastpath+0x1e/0xad Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 291b92a486d5..b760414f3f9d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2028,6 +2028,10 @@ try_onemore: f2fs_join_shrinker(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) @@ -2052,10 +2056,6 @@ try_onemore: goto free_root_inode; } - err = f2fs_build_stats(sbi); - if (err) - goto free_root_inode; - if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -2149,7 +2149,6 @@ free_proc: remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } - f2fs_destroy_stats(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2167,6 +2166,7 @@ free_node_inode: truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: From 26012ec09c68b91929d33ad268a8e470c5df870e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 17 Mar 2017 09:55:52 +0800 Subject: [PATCH 195/804] f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer When I forced to enable atomic operations intentionally, I could hit the below panic, since we didn't clear page->private in f2fs_invalidate_page called by file truncation. The panic occurs due to NULL mapping having page->private. 
BUG: unable to handle kernel paging request at ffffffffffffffff IP: drop_buffers+0x38/0xe0 PGD 5d00c067 PUD 5d00e067 PMD 0 CPU: 3 PID: 1648 Comm: fsstress Tainted: G D OE 4.10.0+ #5 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 task: ffff9151952863c0 task.stack: ffffaaec40db4000 RIP: 0010:drop_buffers+0x38/0xe0 RSP: 0018:ffffaaec40db74c8 EFLAGS: 00010292 Call Trace: ? page_referenced+0x8b/0x170 try_to_free_buffers+0xc5/0xe0 try_to_release_page+0x49/0x50 shrink_page_list+0x8bc/0x9f0 shrink_inactive_list+0x1dd/0x500 ? shrink_active_list+0x2c0/0x430 shrink_node_memcg+0x5eb/0x7c0 shrink_node+0xe1/0x320 do_try_to_free_pages+0xef/0x2e0 try_to_free_pages+0xe9/0x190 __alloc_pages_slowpath+0x390/0xe70 __alloc_pages_nodemask+0x291/0x2b0 alloc_pages_current+0x95/0x140 __page_cache_alloc+0xc4/0xe0 pagecache_get_page+0xab/0x2a0 grab_cache_page_write_begin+0x20/0x40 get_read_data_page+0x2e6/0x4c0 [f2fs] ? f2fs_mark_inode_dirty_sync+0x16/0x30 [f2fs] ? truncate_data_blocks_range+0x238/0x2b0 [f2fs] get_lock_data_page+0x30/0x190 [f2fs] __exchange_data_block+0xaaf/0xf40 [f2fs] f2fs_fallocate+0x418/0xd00 [f2fs] vfs_fallocate+0x157/0x220 SyS_fallocate+0x48/0x80 Signed-off-by: Yunlei He Signed-off-by: Chao Yu [Chao Yu: use INMEM_INVALIDATE for better tracing] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 30 ++++++++++++++++++++++++++++++ include/trace/events/f2fs.h | 2 ++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bda784e38407..3c1221c12026 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1948,7 +1948,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7c7ebf323255..e32bc391ed0b 100644 --- 
a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, @@ -2251,6 +2252,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b914cfb49096..7d7a8270bbbe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode) stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index b95872b9c3ae..c9ea83dfd986 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,6 +15,7 @@ TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(INMEM); TRACE_DEFINE_ENUM(INMEM_DROP); 
+TRACE_DEFINE_ENUM(INMEM_INVALIDATE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); @@ -52,6 +53,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) From dd6b2029c25b44d823d93b7cde0eea6a469a27fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 17 Mar 2017 10:04:15 +0800 Subject: [PATCH 196/804] f2fs: don't allow atomic writes for not regular files The atomic writes only supports regular files for database. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 38d39f656746..0e15770cc728 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1523,6 +1523,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; From 3d60b5db39906313247391c2173496fb0927e5e9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 17 Mar 2017 15:43:57 +0800 Subject: [PATCH 197/804] f2fs: don't allow volatile writes for non-regular file Now f2fs only supports volatile writes for journal db regular file. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0e15770cc728..055495008c6c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1603,6 +1603,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; From 363f8e93f52dafc302716056f31e835b045cd2c3 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 4 Mar 2017 22:13:10 +0800 Subject: [PATCH 198/804] f2fs: make sure trace all f2fs_issue_flush The root device's issue flush trace is missing, add it and tracing the result from submit. Fixes d50aaeec90 ("f2fs: show actual device info in tracepoints") Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 24 +++++++++++++----------- include/trace/events/f2fs.h | 11 +++++++---- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7d7a8270bbbe..26eef87e82ec 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -441,7 +441,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; @@ -450,23 +451,24 @@ static int __submit_flush_wait(struct block_device *bdev) bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - trace_f2fs_issue_flush(FDEV(i).bdev, 
- test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c9ea83dfd986..8ca1ddf50dc1 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1166,26 +1166,29 @@ TRACE_EVENT(f2fs_issue_reset_zone, TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct block_device *dev, unsigned int nobarrier, - unsigned int flush_merge), + unsigned int flush_merge, int ret), - TP_ARGS(dev, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", + TP_printk("dev = (%d,%d), %s %s, ret = %d", show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? " with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, From bfd70a38c16385130ad653d46a2ec694bddbb762 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 28 Feb 2017 21:34:47 +0800 Subject: [PATCH 199/804] f2fs: drop duplicate radix tree lookup of nat_entry_set The nat entry is listed from the set list for freeing, it's duplicate to do radix tree lookup again. 
Signed-off-by: Kinglong Mee [Jaegeuk Kim: remove unnecessary f2fs_bug_on] Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5c70f33a2b4c..edabf883cf0c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -177,18 +177,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -2410,7 +2404,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); From 1ef38ece5cfe9b91e90a280a5513ecacbdc62920 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 28 Feb 2017 21:34:37 +0800 Subject: [PATCH 200/804] f2fs: remove dead macro PGOFS_OF_NEXT_DNODE Fixes: 3cf4574705 ("f2fs: introduce get_next_page_offset to speed up SEEK_DATA") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e32bc391ed0b..83ccbfceffdf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2097,12 +2097,6 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) ((is_inode_flag_set(i, 
FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) - /* * file.c */ From 9f5bdf3b0ab30cf0dacca1c0dcd08f5b258bbc37 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 13:54:56 -0800 Subject: [PATCH 201/804] f2fs: show more precise message on orphan recovery failure This case is not caused by fsck.f2fs. User needs to retry mount. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 08b9a1f578e3..4a0b286790e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -568,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } From 633f62b7dcf918cfcc01b90b41da78d30679df38 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 1 Mar 2017 18:07:10 +0800 Subject: [PATCH 202/804] f2fs: skip writeback meta pages if cp_mutex acquire failed Skip writeback meta pages if cp_mutex lock acquire failed, cp will flush dirty pages instead. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4a0b286790e0..61c519688f9d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -276,10 +276,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping, get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); From 28fa89b32d44f7a09e6ef7357049a4ebce892e3b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 7 Mar 2017 18:02:02 -0800 Subject: [PATCH 203/804] f2fs: allocate a bio for discarding when actually issuing it Let's allocate a bio when issuing discard commands later. 
Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 4 +- fs/f2fs/segment.c | 192 ++++++++++++++++++++++++---------------------- 2 files changed, 105 insertions(+), 91 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 83ccbfceffdf..4ac8700d362a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -264,10 +264,12 @@ enum { struct discard_cmd { struct list_head list; /* command list */ struct completion wait; /* compleation */ + struct block_device *bdev; /* bdev */ block_t lstart; /* logical start address */ + block_t start; /* actual start address in dev */ block_t len; /* length */ - struct bio *bio; /* bio */ int state; /* state */ + int error; /* bio error */ }; struct discard_cmd_control { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 26eef87e82ec..f0c06e4785a9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -666,7 +666,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } static void __add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *cmd_list = &(dcc->discard_cmd_list); @@ -674,11 +675,12 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio = bio; - bio->bi_private = dc; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); mutex_lock(&dcc->cmd_lock); @@ -688,70 +690,27 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { - int err = dc->bio->bi_error; - if (dc->state == D_DONE) atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); - if (err == -EOPNOTSUPP) - err = 0; + if (dc->error == -EOPNOTSUPP) + dc->error = 
0; - if (err) + if (dc->error) f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); - bio_put(dc->bio); + "Issue discard failed, ret: %d", dc->error); list_del(&dc->list); kmem_cache_free(discard_cmd_slab, dc); } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->discard_cmd_list); - struct discard_cmd *dc, *tmp; - struct blk_plug plug; - - mutex_lock(&dcc->cmd_lock); - - blk_start_plug(&plug); - - list_for_each_entry_safe(dc, tmp, wait_list, list) { - - if (blkaddr == NULL_ADDR) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(REQ_SYNC, dc->bio); - atomic_inc(&dcc->submit_discard); - } - continue; - } - - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { - if (dc->state == D_SUBMIT) - wait_for_completion_io(&dc->wait); - else - __remove_discard_cmd(sbi, dc); - } - } - blk_finish_plug(&plug); - - /* this comes from f2fs_put_super */ - if (blkaddr == NULL_ADDR) { - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } - } - mutex_unlock(&dcc->cmd_lock); -} - static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; complete(&dc->wait); + dc->error = bio->bi_error; dc->state = D_DONE; + bio_put(bio); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -835,6 +794,88 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if 
(!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->submit_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(REQ_SYNC, bio); + } + } else { + __remove_discard_cmd(sbi, dc); + } +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + block_t lblkstart = blkstart; + + trace_f2fs_issue_discard(bdev, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + __add_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + return 0; +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->discard_cmd_list); + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + + mutex_lock(&dcc->cmd_lock); + + blk_start_plug(&plug); + + list_for_each_entry_safe(dc, tmp, wait_list, list) { + + if (blkaddr == NULL_ADDR) { + __submit_discard_cmd(sbi, dc); + continue; + } + + if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { + if (dc->state == D_SUBMIT) + wait_for_completion_io(&dc->wait); + else + __remove_discard_cmd(sbi, dc); + } + } + blk_finish_plug(&plug); + + /* this comes from f2fs_put_super */ + if (blkaddr == NULL_ADDR) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -852,15 +893,14 @@ repeat: mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, cmd_list, list) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(REQ_SYNC, 
dc->bio); - atomic_inc(&dcc->submit_discard); - if (iter++ > DISCARD_ISSUE_RATE) - break; - } else if (dc->state == D_DONE) { + + if (is_idle(sbi)) + __submit_discard_cmd(sbi, dc); + + if (dc->state == D_PREP && iter++ > DISCARD_ISSUE_RATE) + break; + if (dc->state == D_DONE) __remove_discard_cmd(sbi, dc); - } } mutex_unlock(&dcc->cmd_lock); @@ -874,34 +914,6 @@ repeat: goto repeat; } - -/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t blkstart, block_t blklen) -{ - struct bio *bio = NULL; - block_t lblkstart = blkstart; - int err; - - trace_f2fs_issue_discard(bdev, blkstart, blklen); - - if (sbi->s_ndevs) { - int devi = f2fs_target_device_index(sbi, blkstart); - - blkstart -= FDEV(devi).start_blk; - } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - bio->bi_end_io = f2fs_submit_discard_endio; - __add_discard_cmd(sbi, bio, lblkstart, blklen); - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); - } - return err; -} - #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -925,7 +937,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, lblkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: sector = SECTOR_FROM_BLOCK(blkstart); @@ -957,7 +969,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return 
__queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, From 02f88520d6f38a485923badc286238a8833dd3ca Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 2 Mar 2017 10:36:20 +0800 Subject: [PATCH 204/804] f2fs: add a punch discard command function This patch adds a function to punch a discard command if a segment is reused before it has been discarded. The reused segment is split out of the multi-segment discard range, and the larger remaining range is kept for discard. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f0c06e4785a9..5fc0173af7e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -838,6 +838,25 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + block_t end_block = START_BLOCK(sbi, GET_SEGNO(sbi, blkaddr) + 1); + + if (dc->state == D_DONE || dc->lstart + dc->len <= end_block) { + __remove_discard_cmd(sbi, dc); + return; + } + + if (blkaddr - dc->lstart < dc->lstart + dc->len - end_block) { + dc->start += (end_block - dc->lstart); + dc->len -= (end_block - dc->lstart); + dc->lstart = end_block; + } else { + dc->len = blkaddr - dc->lstart; + } +} + /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -860,8 +879,7 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { if (dc->state == D_SUBMIT) wait_for_completion_io(&dc->wait); - else - __remove_discard_cmd(sbi, dc); + __punch_discard_cmd(sbi, dc, blkaddr); } } blk_finish_plug(&plug); From 23128a06f3b485635c8388317755c8a7ec1383ef Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 8 Mar 2017 10:47:11 +0800 Subject: [PATCH 205/804] f2fs: use
parameter max_items instead of PIDVEC_SIZE Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..c82ab4048127 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; From ba5e838808d91d5203ddac1031fa2bb186c565dd Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 8 Mar 2017 10:47:12 +0800 Subject: [PATCH 206/804] f2fs: check range before defragment This patch checks the parameter range passed by ioctl to avoid a range that exceeds the max_file_blocks limit. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 055495008c6c..db0659c3c740 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2059,6 +2059,12 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) goto out; } + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) { + err = -EINVAL; + goto out; + } + err = f2fs_defragment_range(sbi, filp, &range); f2fs_update_time(sbi, REQ_TIME); if (err < 0) From ce8679a179a818dad97917e8a3077ff7e4518f64 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 9 Mar 2017 15:24:24 -0800 Subject: [PATCH 207/804] f2fs: add fault injection on f2fs_truncate Inject a fault during f2fs_truncate().
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 6 ++++++ fs/f2fs/super.c | 1 + 3 files changed, 8 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4ac8700d362a..c524d875ac79 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -51,6 +51,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index db0659c3c740..094b83b53ebc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -622,6 +622,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b760414f3f9d..779fd5e5cf40 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; From cb7b3c2fe5e91e92eebff970e18dacdc0df1a194 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Wed, 8 Mar 2017 13:39:16 +0800 Subject: [PATCH 208/804] f2fs: adjust the way of calculating nat block use a slightly simpler expression to calculate nat block with nid. 
Signed-off-by: Fan Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 2f9603fa85a5..ebed0240aa53 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) From 0812585ae021e44eb27c89bc93582471d2475b22 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:54:03 +0800 Subject: [PATCH 209/804] f2fs: drop duplicate new_size assign in f2fs_zero_range Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 094b83b53ebc..9e94ba41a559 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1202,8 +1202,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { From 00a248a675d5a94b70d309f3378d0603bd06a180 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:54:26 +0800 Subject: [PATCH 210/804] f2fs: avoid copying data to user-space if move file range fails If move file range returns an error, the data copied to user-space is redundant.
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9e94ba41a559..7fe9ee4a605d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2213,6 +2213,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) From 74492a8e110cb26337d4a7816a7228383c81e392 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:54:52 +0800 Subject: [PATCH 211/804] f2fs: check new size by inode_newsize_ok in f2fs_insert_range Using inode_newsize_ok() is better than only checking s_maxbytes, because it also covers other limits, e.g. the rlimit. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fe9ee4a605d..702f89a94a9c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1269,8 +1269,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; From 0ec599668d9a5fa0feb60c458a441886547a7c46 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 17:55:07 +0800 Subject: [PATCH 212/804] f2fs: move mnt_want_write_file after arguments checking There is no need to call mnt_want_write_file() before the arguments have been checked.
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 50 +++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 702f89a94a9c..0493afe2b068 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2042,45 +2042,37 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) + return -EINVAL; + err = mnt_want_write_file(filp); if (err) return err; - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } - - if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } - - /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } - - if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) { - err = -EINVAL; - goto out; - } - err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, From 186a33ffeb490bfa2b89905789e2dfa8e320b025 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 20:43:20 +0800 Subject: [PATCH 213/804] f2fs: clear FI_DATA_EXIST flag in truncate_inline_inode Clear FI_DATA_EXIST flag atomically in 
truncate_inline_inode, and the return value from truncate_inline_inode isn't used, remove it. Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 +------- fs/f2fs/file.c | 4 +--- fs/f2fs/inline.c | 21 +++++++++++---------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c524d875ac79..0f7a5a9a8416 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1940,12 +1940,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -2575,7 +2569,7 @@ extern struct kmem_cache *inode_entry_slab; bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void read_inline_data(struct page *page, struct page *ipage); -bool truncate_inline_inode(struct page *ipage, u64 from); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0493afe2b068..24bbe14ff5db 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,9 +571,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(ipage, from); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d82e97b1e6c4..2b8ac2cd35d6 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -63,19 +63,21 @@ void 
read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; if (from >= MAX_INLINE_DATA) - return false; + return; addr = inline_data_addr(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -146,11 +148,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -267,9 +269,8 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -380,7 +381,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -455,7 +456,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - 
truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); From 8f3d1ba54be036b594c017dae8a7087f5710b052 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 13 Mar 2017 16:35:13 +0800 Subject: [PATCH 214/804] f2fs: fix bad prefetchw of NULL page For f2fs_read_data_pages, the f2fs_mpage_readpages gets "page == NULL", so that, the prefetchw(&page->flags) is operated on NULL. Fixes: f1e8866016 ("f2fs: expose f2fs_mpage_readpages") Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3c1221c12026..9781e0b9153c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1150,9 +1150,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) From 89f28f5e525195254cce03b9e871f5b0f6b6ba80 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Mar 2017 20:10:41 +0800 Subject: [PATCH 215/804] f2fs: cover update_free_nid_bitmap with nid_list_lock free_nid_bitmap and free_nid_count in update_free_nid_bitmap should be updated atomically, use nid_list_lock cover them to avoid race in concurrent scenario. 
Signed-off-by: Chao Yu Reviewed-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/node.c | 27 +++++++++++---------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0f7a5a9a8416..4398abfe13b1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -632,7 +632,6 @@ struct f2fs_nm_info { unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ - spinlock_t free_nid_lock; /* protect updating of nid count */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index edabf883cf0c..077bdb134e97 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1818,7 +1818,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build, bool locked) + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1832,14 +1832,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!locked) - spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - if (!locked) - spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1868,7 +1864,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2023,7 +2021,7 @@ retry: __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - 
update_free_nid_bitmap(sbi, *nid, false, false, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2079,7 +2077,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2409,11 +2407,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false, false); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2538,10 +2536,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK; last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; - spin_lock(&nm_i->free_nid_lock); + spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) - update_free_nid_bitmap(sbi, nid, true, true, true); - spin_unlock(&nm_i->free_nid_lock); + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } for (i = 0; i < nm_i->nat_blocks; i++) { @@ -2632,9 +2630,6 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; - - spin_lock_init(&nm_i->free_nid_lock); - return 0; } From 27eff7f2f11c6abb30bc338b70bb960fa8ace1a0 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 10 Mar 2017 16:28:46 +0800 Subject: [PATCH 216/804] f2fs: cleanup the disk level filename updating As discuss with Jaegeuk and Chao, "Once checkpoint is done, f2fs doesn't need to update there-in filename at all." The disk-level filename is used only one case, 1. 
create a file A under a dir 2. sync A 3. godown 4. umount 5. mount (roll_forward) Only the rename/cross_rename changes the filename, if it happens, a. between step 1 and 2, the sync A will caused checkpoint, so that, the roll_forward at step 5 never happens. b. after step 2, the roll_forward happens, file A will roll forward to the result as after step 1. So that, any updating the disk filename is useless, just cleanup it. Signed-off-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 25 ++++--------------------- fs/f2fs/f2fs.h | 2 -- fs/f2fs/file.c | 8 -------- fs/f2fs/inline.c | 2 -- fs/f2fs/namei.c | 29 ----------------------------- 5 files changed, 4 insertions(+), 62 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4e2153620a3b..b71b7f364107 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -337,24 +337,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -438,8 +420,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. 
@@ -599,8 +584,6 @@ add_dentry: err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4398abfe13b1..2410f1b4ece2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2154,8 +2154,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 24bbe14ff5db..bc5f73828a9b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -112,20 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; - if (file_enc_name(inode)) - return 0; - inode = igrab(inode); dentry = d_find_any_alias(inode); iput(inode); if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2b8ac2cd35d6..a92370516659 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -528,8 +528,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 65fff81889cf..43eb2bd417a8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -717,13 +717,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - 
release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -776,8 +769,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, down_write(&F2FS_I(old_inode)->i_sem); file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -914,18 +905,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -969,14 +948,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); From 96d73c33c4a3749d1d2b654313bcc98db0156cd1 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Wed, 15 Mar 2017 21:12:50 +0800 Subject: [PATCH 217/804] f2fs: sanity check of crc_offset from raw checkpoint The crc_offset towards or beyond the end of block is wrong, sanity check it. 
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 61c519688f9d..afe4616af025 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -679,7 +679,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; From cc248f964ee86a60e04b549caa70277db9c55374 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 18 Mar 2017 09:20:55 +0800 Subject: [PATCH 218/804] f2fs: avoid stat_inc_atomic_write for non-atomic file After filemap_write_and_wait_range fails, the FI_ATOMIC_FILE flag is removed, so f2fs should not increase the stat of atomic_write.
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bc5f73828a9b..a96d3193f209 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1538,17 +1538,21 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + goto out; + } + +inc_stat: stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; From fecfdd67f86e66a821253e5a6d9d41ed0348cd45 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 18 Mar 2017 09:25:05 +0800 Subject: [PATCH 219/804] f2fs: calculate the f2fs_stat_info into base_mem The memory size of f2fs_stat_info also should be calculated. 
Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ee2d0a485fc3..ef1179df05d9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -156,7 +156,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; From ac60235af91e76a5b497cac4f1524f43cf8e4633 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sat, 18 Mar 2017 09:26:13 +0800 Subject: [PATCH 220/804] f2fs: more reasonable mem_size calculating of ino_entry Signed-off-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 077bdb134e97..c31283624cfe 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * From 506e7056e23ec95f2bd45fde50384bb036bb14af Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 21 Mar 2017 20:09:45 +0800 Subject: [PATCH 221/804] f2fs: fix recording invalid last_victim When doing garbage collection, we try to record segment offset which locates at next one of last victim, using it as the start offset in next 
searching. But in some corner cases, the recorded offset may cross the end of the main segment area, which causes an incorrect search in the dirty_segmap bitmap. This patch adds a modulo operation to avoid this issue. Reported-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 68d6a4cad349..c5644127fd4f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -361,6 +361,7 @@ next: sbi->last_victim[p.gc_mode] = last_victim + 1; else sbi->last_victim[p.gc_mode] = segno + 1; + sbi->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } From ca28c969a4b2478f97a9e176163a5dbf2e3fd617 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 22 Mar 2017 11:59:30 +0800 Subject: [PATCH 222/804] f2fs: use set_page_private marcro in f2fs_trace_pid Use the set_page_private macro instead of operating on the page struct directly. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index c82ab4048127..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; From 743ef11f591dcbb9d0318a107e35cdb40419f9a4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Mar 2017 14:45:05 +0800 Subject: [PATCH 223/804] f2fs: fix race condition in between free nid allocator/initializer In the concurrent case below, an allocated nid can be loaded into the free nid cache and be allocated again.
Thread A Thread B - f2fs_create - f2fs_new_inode - alloc_nid - __insert_nid_to_list(ALLOC_NID_LIST) - f2fs_balance_fs_bg - build_free_nids - __build_free_nids - scan_nat_page - add_free_nid - __lookup_nat_cache - f2fs_add_link - init_inode_metadata - new_inode_page - new_node_page - set_node_addr - alloc_nid_done - __remove_nid_from_list(ALLOC_NID_LIST) - __insert_nid_to_list(FREE_NID_LIST) This patch makes nat cache lookup and free nid list operation being atomical to avoid this race condition. Signed-off-by: Jaegeuk Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 63 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c31283624cfe..c098e90cfae7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1764,40 +1764,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) return false; - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return false; - } - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return true; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - 
init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return true; - } - return true; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) From df1b8e6f245a1ebdd0055bf603009a8473237d16 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Mar 2017 17:23:45 +0800 Subject: [PATCH 224/804] f2fs: show the max number of volatile operations This patch adds to show the max number of volatile operations which are conducting concurrently. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 10 ++++++++-- fs/f2fs/f2fs.h | 18 +++++++++++++++++- fs/f2fs/file.c | 5 +++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ef1179df05d9..0baa3ee39392 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,7 +51,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); if (SM_I(sbi) && SM_I(sbi)->fcc_info) @@ -337,8 +339,10 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flush, si->nr_discard); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. 
%4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -438,7 +442,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2410f1b4ece2..4b19cba0fee2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -982,7 +982,9 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -2395,7 +2397,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int aw_cnt, max_aw_cnt; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2478,6 +2480,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = 
atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2534,6 +2547,9 @@ void f2fs_destroy_root_stats(void); #define stat_inc_atomic_write(inode) #define stat_dec_atomic_write(inode) #define stat_update_max_atomic_write(inode) +#define stat_inc_volatile_write(inode) +#define stat_dec_volatile_write(inode) +#define stat_update_max_volatile_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a96d3193f209..b06a52e33a79 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1431,6 +1431,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1618,6 +1619,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1673,6 +1677,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } From afc8c720de8613d2e740a96e030060643b21b0d3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Mar 2017 17:23:46 +0800 Subject: [PATCH 225/804] f2fs: don't track volatile file in dirty inode list Don't track volatile file in dirty inode list, otherwise with data_flush option, background thread will entry into endless loop 
for flushing journal file's pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index afe4616af025..9d92f83cce94 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -818,7 +818,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } From b92a30224597ac6062298756fba67380d7069113 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Mar 2017 13:38:25 +0800 Subject: [PATCH 226/804] f2fs: clean up xattr operation 1. don't allocate redundant memory in read_all_xattrs. 2. introduce RESERVED_XATTR_SIZE for cleanup. Signed-off-by: Chao Yu Reviewed-by: Kinglong Mee Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 25 +++++++++++-------------- fs/f2fs/xattr.h | 3 ++- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index fb5062a4df77..afe14845c00a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -297,15 +297,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? 
VALID_XATTR_BLOCK_SIZE : 0; - unsigned int inline_size = 0; + unsigned int inline_size = inline_xattr_size(inode); int err = 0; - inline_size = inline_xattr_size(inode); - if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -375,13 +373,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -405,19 +404,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. 
*/ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -439,14 +438,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index ba64f43d163d..d111568daf83 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -73,7 +73,8 @@ struct f2fs_xattr_entry { !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) #define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) -#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define RESERVED_XATTR_SIZE (sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - RESERVED_XATTR_SIZE) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) From fbe4cc0f76dcd78a09acd155761b505291e84087 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Mar 2017 13:38:26 +0800 Subject: [PATCH 227/804] f2fs: don't reserve additional space in xattr block In this patch, we change xattr block disk layout as below: Before: xattr node block layout +---------------------------------------------+---------------+-------------+ | node block xattr entries | reserved | node footer | | 4068 Bytes | 4 Bytes | 24 Bytes | In memory layout +--------------------+---------------------------------+--------------------+ | inline xattr | node block xattr entries | reserved | | 200 Bytes 
| 4068 Bytes | 4 Bytes | After: xattr node block layout +-------------------------------------------------------------+-------------+ | node block xattr entries | node footer | | 4072 Bytes | 24 Bytes | In memory layout +--------------------+---------------------------------+--------------------+ | inline xattr | node block xattr entries | reserved | | 200 Bytes | 4072 Bytes | 4 Bytes | With this change, we don't need to reserve additional space in node block, just keep reserved space in logical in-memory layout. So that it would help to enlarge valid free space of xattr node block. As tested, generic/026 shows max stored xattr entires number increases from 531 to 532 when inline_xattr option is enabled. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 6 +++--- fs/f2fs/xattr.h | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index afe14845c00a..aaf0a4167175 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -303,7 +303,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -379,7 +379,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, void *txattr_addr; int err; - txattr_addr = kzalloc(inline_size + size + RESERVED_XATTR_SIZE, + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -498,7 +498,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 
d111568daf83..91f3bd88dcc6 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -72,9 +72,8 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) -#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) -#define RESERVED_XATTR_SIZE (sizeof(__u32)) -#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - RESERVED_XATTR_SIZE) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) From aa9de43b3bc690f168f03964a53cb615227c694f Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 13 Mar 2017 20:22:18 +0800 Subject: [PATCH 228/804] f2fs: allow write page cache when writting cp This patch allow write data to normal file when writting new checkpoint. We relax three limitations for write_begin path: 1. data allocation 2. node allocation 3. variables in checkpoint Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 40 ++++++++++++++++++++++++++++------------ fs/f2fs/data.c | 28 ++++++++++++++++++++++------ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 12 ++++++------ fs/f2fs/super.c | 1 + 5 files changed, 58 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9d92f83cce94..a9f141abac5d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -945,6 +945,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); 
+} + /* * Freeze all the FS-operations for checkpoint. */ @@ -971,7 +984,14 @@ retry_flush_dents: goto retry_flush_dents; } + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + down_write(&sbi->node_change); + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) @@ -979,10 +999,6 @@ retry_flush_dents: goto retry_flush_dents; } - /* - * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. - */ retry_flush_nodes: down_write(&sbi->node_write); @@ -990,11 +1006,19 @@ retry_flush_nodes: up_write(&sbi->node_write); err = sync_node_pages(sbi, &wbc); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. 
+ */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -1062,7 +1086,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1079,14 +1102,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1105,10 +1125,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); spin_lock(&sbi->cp_lock); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9781e0b9153c..49c04c4e3bd8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -786,6 +786,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. 
@@ -828,7 +843,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -938,7 +953,7 @@ skip: f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -947,7 +962,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -1686,7 +1701,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1722,7 +1737,8 @@ restart: err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1736,7 +1752,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b19cba0fee2..1f3576a74112 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -897,6 +897,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c098e90cfae7..b737c049174a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2451,10 +2451,11 @@ static void 
__flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* @@ -2499,8 +2500,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); - - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); + /* Allow dirty nats by node block allocation in write_begin */ } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 779fd5e5cf40..19abc4fc6592 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1924,6 +1924,7 @@ try_onemore: mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); From 0137923fb501557e166e2d1dc02a4cb1c148b761 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 25 Mar 2017 00:03:02 -0700 Subject: [PATCH 229/804] f2fs: fix wrong max cost initialization This patch fixes the missing increase of the max cost, caused by a patch in which we increased the cost of data segments in the greedy algorithm.
Cc: # v4.10+ Fixes: b9cd20619 "f2fs: node segment is prior to data segment selected victim" Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c5644127fd4f..3db2d26e004a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -182,7 +182,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ From 5bac5ad719c9fce5975f1c838a66b530d4a3d8f2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Mar 2017 20:41:45 -0400 Subject: [PATCH 230/804] f2fs: allocate node and hot data in the beginning of partition In order to give more spatial locality, this patch changes the block allocation policy which assigns beginning of partition for small and hot data/node blocks. In order to do this, we set noheap allocation by default and introduce another mount option, heap, to reset it back. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 +++++- fs/f2fs/segment.c | 9 +++++++++ fs/f2fs/super.c | 10 +++++++++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3db2d26e004a..90ed2cdff86d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = sbi->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5fc0173af7e3..a305b38737f8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1598,6 +1598,14 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. 
@@ -1616,6 +1624,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19abc4fc6592..e43824849cb7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -83,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -117,6 +118,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -437,6 +439,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -915,7 +920,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -1049,6 +1056,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { From f546e14f0351c04dc996a266f20d81ad639ba863 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Mar 2017 21:08:56 -0400 Subject: [PATCH 231/804] f2fs: start SSR much eariler to avoid FG_GC This patch initiates SSR much earlier, resulting in less FG_GC.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e8ad4280a50..31846b0fcb95 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -495,7 +495,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) return false; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + 2 * reserved_sections(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, From 361ee401443b1eda1267f0a2f9113e45f1fdd947 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 21 Mar 2017 10:59:50 -0400 Subject: [PATCH 232/804] f2fs: relax node version check for victim data in gc - has_not_enough_free_secs node_secs: 0 dent_secs: 0 freed:0 free_segments:103 reserved:104 - f2fs_gc - get_victim_by_default alloc_mode 0, gc_mode 1, max_search 2672, offset 4654, ofs_unit 1 - do_garbage_collect start_segno 3976, end_segno 3977 type 0 - is_alive nid 22797, blkaddr 2131882, ofs_in_node 0, version 0x8/0x0 - gc_data_segment 766, segno 3976, block 512/426 not alive So, this patch fixes a subtle corruption case where the node version does not match the summary version, which results in an infinite loop in gc.
Reported-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 90ed2cdff86d..d712b64ee6c2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -555,8 +555,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); From 0d5b6b22f109d29112080ef74da07c2ac8916e3c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Mar 2017 17:19:58 +0800 Subject: [PATCH 233/804] f2fs: show issued flush/discard count Show historical count of flush command and discard command. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 24 ++++++++++++++++-------- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/segment.c | 39 ++++++++++++++++++++++++--------------- 3 files changed, 46 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0baa3ee39392..f27e66ea7ff3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -56,12 +56,18 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); - if (SM_I(sbi) && SM_I(sbi)->fcc_info) - si->nr_flush = - atomic_read(&SM_I(sbi)->fcc_info->submit_flush); - if (SM_I(sbi) && SM_I(sbi)->dcc_info) - si->nr_discard = - atomic_read(&SM_I(sbi)->dcc_info->submit_discard); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + 
si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -336,9 +342,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d))\n", si->nr_wb_cp_data, si->nr_wb_data, - si->nr_flush, si->nr_discard); + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f3576a74112..26beb67825c2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -281,7 +281,8 @@ struct discard_cmd_control { wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; int max_discards; /* max. 
discards to be issued */ - atomic_t submit_discard; /* # of issued discard */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ }; /* for the list of fsync inodes, used only during recovery */ @@ -710,7 +711,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -2396,7 +2398,8 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a305b38737f8..717d6cc51ef2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -490,6 +490,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -507,25 +509,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return submit_flush_wait(sbi); - - if (!atomic_read(&fcc->submit_flush)) { - int ret; - - atomic_inc(&fcc->submit_flush); + 
if (!test_opt(sbi, FLUSH_MERGE)) { ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (!atomic_read(&fcc->issing_flush)) { + atomic_inc(&fcc->issing_flush); + ret = submit_flush_wait(sbi); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); + atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) @@ -533,10 +539,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issing_flush, 0); } return cmd.ret; @@ -556,7 +562,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; @@ -691,7 +698,7 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); + atomic_dec(&(SM_I(sbi)->dcc_info->issing_discard)); if (dc->error == -EOPNOTSUPP) dc->error = 0; @@ -810,7 +817,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (!dc->error) { /* should keep before submission to avoid D_DONE right away */ dc->state = D_SUBMIT; - atomic_inc(&dcc->submit_discard); + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; @@ -1214,7 +1222,8 @@ static int create_discard_cmd_control(struct 
f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->discard_entry_list); INIT_LIST_HEAD(&dcc->discard_cmd_list); mutex_init(&dcc->cmd_lock); - atomic_set(&dcc->submit_discard, 0); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); dcc->nr_discards = 0; dcc->max_discards = 0; From 54c1e9049e250dfbbef3f2a51da4efa72e6e4b0c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 25 Mar 2017 17:19:59 +0800 Subject: [PATCH 234/804] f2fs: count discard command entry Add a count of discard command entries and show the number in debugfs; also fix to add the cost of the discard command cache into the total consumed memory footprint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 +++++++++--- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 ++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f27e66ea7ff3..906f627e44fc 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -67,6 +67,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->issued_discard); si->nr_discarding = atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); @@ -220,8 +222,11 @@ get_cache: /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); - if (SM_I(sbi)->dcc_info) + if (SM_I(sbi)->dcc_info) { si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -343,10 +348,11 @@ static int stat_show(struct seq_file *s, void *v) si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " - "Discard: (%4d
%4d))\n", + "Discard: (%4d %4d)) cmd: %4d\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->nr_discarding, si->nr_discarded); + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 26beb67825c2..101cc39a8f96 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -283,6 +283,7 @@ struct discard_cmd_control { int max_discards; /* max. discards to be issued */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ }; /* for the list of fsync inodes, used only during recovery */ @@ -2400,6 +2401,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 717d6cc51ef2..036b41257d60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -693,6 +693,8 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); list_add_tail(&dc->list, cmd_list); mutex_unlock(&dcc->cmd_lock); + + atomic_inc(&dcc->discard_cmd_cnt); } static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) @@ -708,6 +710,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *d "Issue discard failed, ret: %d", dc->error); list_del(&dc->list); kmem_cache_free(discard_cmd_slab, dc); + atomic_dec(&SM_I(sbi)->dcc_info->discard_cmd_cnt); } static void f2fs_submit_discard_endio(struct bio *bio) @@ -1224,6 +1227,7 @@ static int 
create_discard_cmd_control(struct f2fs_sb_info *sbi) mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = 0; From 79bd5ed6e3181002b57cf693dff1f0a886007453 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Mar 2017 18:14:04 +0800 Subject: [PATCH 235/804] f2fs: clean up destroy_discard_cmd_control Remove unneeded parameter and simply change flow in destroy_discard_cmd_control. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 036b41257d60..4934e8869240 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1246,20 +1246,22 @@ init_thread: return err; } -static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - if (dcc && dcc->f2fs_issue_discard) { + if (!dcc) + return; + + if (dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } - if (free) { - kfree(dcc); - SM_I(sbi)->dcc_info = NULL; - } + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -3152,7 +3154,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); - destroy_discard_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); From 77deaff0083f66f951b6415382e26b29b41d29af Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 28 Mar 2017 18:18:50 +0800 Subject: [PATCH 236/804] f2fs: use bitmap in discard_entry This patch changes to use bitmap instead of extent in struct 
discard_entry to indicate discard range in one segment, for fragmented space, this implementation can save memory footprint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++-- fs/f2fs/segment.c | 72 ++++++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 101cc39a8f96..c46d1b015db0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -249,11 +249,11 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; enum { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4934e8869240..0aab0bdb5da3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1041,32 +1041,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len && - last->len < MAX_DISCARD_BLOCKS(sbi)) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->dcc_info->nr_discards += end - start; 
-} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -1079,6 +1053,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason == CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -1110,7 +1086,17 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } @@ -1196,13 +1182,35 @@ next: /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (force && len < cpc->trim_minlen) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= 
total_len; kmem_cache_free(discard_entry_slab, entry); } } From 669457e6c2af8bdf84090510a4955a249697683f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 24 Mar 2017 20:05:13 -0400 Subject: [PATCH 237/804] f2fs: write small sized IO to hot log It would better split small and large IOs separately in order to get more consecutive big writes. The default threshold is set to 64KB, but configurable by sysfs/min_hot_blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 9 +++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/inline.c | 1 + fs/f2fs/segment.c | 13 ++++++------- fs/f2fs/segment.h | 1 + fs/f2fs/super.c | 2 ++ 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 49c04c4e3bd8..5cb4067c3d84 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1430,6 +1430,8 @@ write: need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { @@ -1455,6 +1457,7 @@ out: if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, DATA, WRITE); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; } @@ -1509,6 +1512,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c46d1b015db0..2542548233db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -744,6 +744,7 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* 
threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -1783,6 +1784,7 @@ enum { FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a92370516659..e4f12891d4e4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -137,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0aab0bdb5da3..ed9db665ffe7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1908,18 +1908,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (p_type == DATA) { struct inode *inode = page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { if (IS_DNODE(page)) return is_cold_node(page) ? 
CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } @@ -3026,6 +3024,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 31846b0fcb95..57e36c1ce7bd 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -540,6 +540,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e43824849cb7..3691413e51c3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -296,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -321,6 +322,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), From 654cbabc87dc4f5be00da12a7e7799e2eb5a28dc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Mar 2017 18:07:38 -0700 Subject: [PATCH 238/804] f2fs: avoid IO split due to mixed WB_SYNC_ALL and WB_SYNC_NONE If two threads try to flush dirty pages in different inodes respectively, f2fs_write_data_pages() will 
produce WRITE and WRITE_SYNC one at a time, resulting in a lot of separate 4KB IOs. So, this patch gives higher priority to WB_SYNC_ALL IOs and gathers write IOs with a big WRITE_SYNC'ed bio. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 15 +++++++++++++-- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5cb4067c3d84..481dd2cff3ac 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1603,8 +1603,10 @@ continue_unlock: last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give a priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1660,9 +1662,18 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2542548233db..ad8c54848edf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -954,6 +954,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3691413e51c3..ea28312fa80f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1568,6 +1568,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); From bdc8c12ddd5508dd541c62daf3a9a6e5dadfd104 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 2 Apr 2017 02:39:48 +0800 Subject: [PATCH 239/804] f2fs: remove the redundant variable definition The variable 'i' has been defined before, so here we can use it directly. Signed-off-by: Kaixu Xia Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a9f141abac5d..8b106d10afe7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1164,7 +1164,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write nat bits */ if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); - unsigned int i; block_t blk; cp_ver |= ((__u64)crc32 << 32); From 074a551c90b9b1bf413b7b44ec5ee3ab9c7526eb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 30 Mar 2017 21:02:46 -0700 Subject: [PATCH 240/804] f2fs: submit bio of in-place-update pages This patch tries to split in-place-update bios from sequential bios. 
Suggested-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 ++++- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 481dd2cff3ac..9ce6c3435c00 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -361,6 +361,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } @@ -1352,7 +1355,7 @@ retry_encrypt: !is_cold_data(page) && !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { - rewrite_data_page(fio); + err = rewrite_data_page(fio); set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ad8c54848edf..91174ed207de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2273,7 +2273,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -void rewrite_data_page(struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ed9db665ffe7..5e9635df9923 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2040,11 +2040,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - 
f2fs_submit_page_mbio(fio); + return f2fs_submit_page_bio(fio); } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, From 34cc766bfbd1f720362f457a9dfe7603c76605d6 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Tue, 4 Apr 2017 13:01:22 +0300 Subject: [PATCH 241/804] f2fs: split make_dentry_ptr() into block and inline versions Since callers statically know which type to use, make_dentry_ptr() can simply be split into two inline functions. This way, less code is inlined, and there are fewer arguments and no cast. Signed-off-by: Tomohiro Kusumi Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 32 +++++++++++++++----------------- fs/f2fs/inline.c | 10 +++++----- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b71b7f364107..db077960e376 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -366,7 +366,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -586,7 +586,7 @@ add_dentry: } } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -894,7 +894,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * 
NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 91174ed207de..48d3882f9d88 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -395,26 +395,24 @@ struct f2fs_dentry_ptr { int max; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) +{ + d->inode = inode; + d->max = NR_INLINE_DENTRY; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; } /* diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e4f12891d4e4..b3bd1012a4fc 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -302,7 +302,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -321,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, dentry_blk = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -402,7 
+402,7 @@ static int f2fs_add_inline_entries(struct inode *dir, unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -534,7 +534,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) From e5c2a70c4a8b85817054c1292f497309a7d871f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 4 Apr 2017 16:45:30 -0700 Subject: [PATCH 242/804] Revert "f2fs: put allocate_segment after refresh_sit_entry" This reverts commit 3436c4bdb30de421d46f58c9174669fbcfd40ce0. This makes a leak to register dirty segments. I reproduced the issue by modified postmark which injects a lot of file create/delete/update and finally triggers huge number of SSR allocations. Cc: # v4.10+ [Jaegeuk Kim: Change missing incorrect comment] Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5e9635df9923..c79f1b05d667 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1960,14 +1960,13 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); - /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. 
- */ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); + /* + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. + */ + refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); mutex_unlock(&sit_i->sentry_lock); From 4adc71ee11d7edb42a176efc970baa45ba9d6e9a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2017 18:19:48 +0800 Subject: [PATCH 243/804] f2fs: split discard_cmd_list Split discard_cmd_list to discard_{pend,wait}_list, so while sending/waiting discard command, we can avoid traversing unneeded entries in original list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 47 ++++++++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 48d3882f9d88..d0d9668bd738 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -277,7 +277,8 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head discard_entry_list; /* 4KB discard entry list */ int nr_discards; /* # of discards in the list */ - struct list_head discard_cmd_list; /* discard cmd list */ + struct list_head discard_pend_list; /* store pending entries */ + struct list_head discard_wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; int max_discards; /* max. 
discards to be issued */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c79f1b05d667..0582eecb5272 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -677,7 +677,7 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *cmd_list = &(dcc->discard_cmd_list); + struct list_head *pend_list = &(dcc->discard_pend_list); struct discard_cmd *dc; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); @@ -691,7 +691,7 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, init_completion(&dc->wait); mutex_lock(&dcc->cmd_lock); - list_add_tail(&dc->list, cmd_list); + list_add_tail(&dc->list, pend_list); mutex_unlock(&dcc->cmd_lock); atomic_inc(&dcc->discard_cmd_cnt); @@ -826,6 +826,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); + list_move_tail(&dc->list, &dcc->discard_wait_list); } } else { __remove_discard_cmd(sbi, dc); @@ -872,31 +873,37 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->discard_cmd_list); + struct list_head *pend_list = &(dcc->discard_pend_list); + struct list_head *wait_list = &(dcc->discard_wait_list); struct discard_cmd *dc, *tmp; struct blk_plug plug; mutex_lock(&dcc->cmd_lock); - blk_start_plug(&plug); + if (blkaddr == NULL_ADDR) + goto release_discard; + + list_for_each_entry_safe(dc, tmp, pend_list, list) { + if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) + __punch_discard_cmd(sbi, dc, blkaddr); + } list_for_each_entry_safe(dc, tmp, wait_list, list) { - - if (blkaddr == NULL_ADDR) { - __submit_discard_cmd(sbi, dc); - continue; - } - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { if (dc->state == D_SUBMIT) 
wait_for_completion_io(&dc->wait); __punch_discard_cmd(sbi, dc, blkaddr); } } - blk_finish_plug(&plug); +release_discard: /* this comes from f2fs_put_super */ if (blkaddr == NULL_ADDR) { + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, pend_list, list) + __submit_discard_cmd(sbi, dc); + blk_finish_plug(&plug); + list_for_each_entry_safe(dc, tmp, wait_list, list) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); @@ -910,7 +917,8 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *cmd_list = &dcc->discard_cmd_list; + struct list_head *pend_list = &dcc->discard_pend_list; + struct list_head *wait_list = &dcc->discard_wait_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int iter = 0; @@ -921,13 +929,17 @@ repeat: blk_start_plug(&plug); mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, cmd_list, list) { + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); if (is_idle(sbi)) __submit_discard_cmd(sbi, dc); - if (dc->state == D_PREP && iter++ > DISCARD_ISSUE_RATE) + if (iter++ > DISCARD_ISSUE_RATE) break; + } + + list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->state == D_DONE) __remove_discard_cmd(sbi, dc); } @@ -938,8 +950,8 @@ repeat: iter = 0; congestion_wait(BLK_RW_SYNC, HZ/50); - wait_event_interruptible(*q, - kthread_should_stop() || !list_empty(&dcc->discard_cmd_list)); + wait_event_interruptible(*q, kthread_should_stop() || + !list_empty(pend_list) || !list_empty(wait_list)); goto repeat; } @@ -1231,7 +1243,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&dcc->discard_entry_list); - INIT_LIST_HEAD(&dcc->discard_cmd_list); + INIT_LIST_HEAD(&dcc->discard_pend_list); + INIT_LIST_HEAD(&dcc->discard_wait_list); mutex_init(&dcc->cmd_lock); 
atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); From 745d922434837feebaa8497563bf8f8b39cde782 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2017 18:19:49 +0800 Subject: [PATCH 244/804] f2fs: introduce f2fs_wait_discard_bios Split f2fs_wait_discard_bios from f2fs_wait_discard_bio, just for cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 37 ++++++++++++++++++++++--------------- fs/f2fs/super.c | 2 +- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0d9668bd738..63e333edefaa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2260,7 +2260,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0582eecb5272..acf0c2c7d3b3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -876,13 +876,9 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) struct list_head *pend_list = &(dcc->discard_pend_list); struct list_head *wait_list = &(dcc->discard_wait_list); struct discard_cmd *dc, *tmp; - struct blk_plug plug; mutex_lock(&dcc->cmd_lock); - if (blkaddr == NULL_ADDR) - goto release_discard; - list_for_each_entry_safe(dc, tmp, pend_list, list) { if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) __punch_discard_cmd(sbi, dc, blkaddr); @@ -896,19 +892,30 @@ void 
f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) } } -release_discard: - /* this comes from f2fs_put_super */ - if (blkaddr == NULL_ADDR) { - blk_start_plug(&plug); - list_for_each_entry_safe(dc, tmp, pend_list, list) - __submit_discard_cmd(sbi, dc); - blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } +/* This comes from f2fs_put_super */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list = &(dcc->discard_pend_list); + struct list_head *wait_list = &(dcc->discard_wait_list); + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + + mutex_lock(&dcc->cmd_lock); + + blk_start_plug(&plug); + list_for_each_entry_safe(dc, tmp, pend_list, list) + __submit_discard_cmd(sbi, dc); + blk_finish_plug(&plug); + + list_for_each_entry_safe(dc, tmp, wait_list, list) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); } + mutex_unlock(&dcc->cmd_lock); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea28312fa80f..d50eb8d27e60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -795,7 +795,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); From e09409d5c38de68d0c0f4fe43c3ae6da521fb58a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Apr 2017 18:26:26 +0800 Subject: [PATCH 245/804] f2fs: prevent waiter encountering incorrect discard states In f2fs_submit_discard_endio, we will wake up waiter before setting discard command states, so waiter may use incorrect states. Change the order between complete() and states setting to fix this issue. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf0c2c7d3b3..dc2737cfaa67 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -717,9 +717,9 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - complete(&dc->wait); dc->error = bio->bi_error; dc->state = D_DONE; + complete(&dc->wait); bio_put(bio); } @@ -886,8 +886,7 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { - if (dc->state == D_SUBMIT) - wait_for_completion_io(&dc->wait); + wait_for_completion_io(&dc->wait); __punch_discard_cmd(sbi, dc, blkaddr); } } @@ -947,8 +946,10 @@ repeat: } list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->state == D_DONE) + if (dc->state == D_DONE) { + wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); + } } mutex_unlock(&dcc->cmd_lock); From f886a1df9e79b2e70d4fb4e6a93a328a8cf64405 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Wed, 5 Apr 2017 22:49:44 +0300 Subject: [PATCH 246/804] f2fs: fix comment on f2fs_flush_merged_bios() after 86531d6b Callers are to unlock the page on failure after 86531d6b. Signed-off-by: Tomohiro Kusumi Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ce6c3435c00..85da089004ee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -340,7 +340,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. 
*/ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { From 1c72805ab23732e18b36c8727ebad842786b18dc Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Sun, 9 Apr 2017 02:11:36 +0300 Subject: [PATCH 247/804] f2fs: guard macro variables with braces Add braces around variables used within macros where it makes sense to do so. Many of the macros in f2fs already do this. What this commit doesn't do is anything that changes line# as a result of adding braces, which usually affects the binary via __LINE__. Confirmed no diff in fs/f2fs/f2fs.ko before/after this commit on x86_64, to make sure this has no functional change as well as there's been no unexpected side effect due to callers' arithmetic within the existing code. Signed-off-by: Tomohiro Kusumi Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 36 +++++++++++----------- fs/f2fs/node.c | 2 +- fs/f2fs/node.h | 22 +++++++------- fs/f2fs/segment.h | 76 +++++++++++++++++++++++------------------------ fs/f2fs/xattr.h | 4 +-- 5 files changed, 70 insertions(+), 70 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 63e333edefaa..cbc35660c466 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -64,7 +64,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -90,9 +90,9 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) 
(typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -295,13 +295,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -812,7 +812,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ @@ -1050,8 +1050,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. 
*/ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -2504,8 +2504,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2515,14 +2515,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2530,7 +2530,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? 
(blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b737c049174a..19ea77dc3192 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -22,7 +22,7 @@ #include "trace.h" #include -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index ebed0240aa53..558048e33cf9 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define 
nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 57e36c1ce7bd..b8a1bac9355d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,78 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - 
sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + 
(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) + ((segno) / (sbi)->segs_per_sec) #define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) + (((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +103,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. 
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 91f3bd88dcc6..08a4840d6d7d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) From 20e7964704de1526f45318fadc3b65aa80dcb662 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Apr 2017 14:27:07 -0700 Subject: [PATCH 248/804] f2fs: use segment number for get_valid_blocks This patch fixes to submit a segment number for get_valid_blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b8a1bac9355d..39ef9cc0093b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -80,6 +80,8 @@ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define GET_SECNO(sbi, segno) \ ((segno) / (sbi)->segs_per_sec) +#define GET_SEGNO_FROM_SECNO(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) #define GET_ZONENO_FROM_SEGNO(sbi, segno) \ (((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone) @@ -720,8 +722,8 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= - sbi->fggc_threshold) + if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), + sbi->segs_per_sec) >= sbi->fggc_threshold) return true; return false; } From af381ca699eb25cb4ab153300b5c0b9d3c727bb2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: 
Fri, 7 Apr 2017 14:33:22 -0700 Subject: [PATCH 249/804] f2fs: clean up get_valid_blocks with consistent parameter This patch cleans up get_valid_blocks, which has no functional change. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/gc.c | 8 ++++---- fs/f2fs/segment.c | 8 ++++---- fs/f2fs/segment.h | 8 ++++---- fs/f2fs/super.c | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 906f627e44fc..dc16a52db275 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -137,7 +137,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d712b64ee6c2..ec07940ea722 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -229,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -252,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int valid_blocks = - get_valid_blocks(sbi, segno, sbi->segs_per_sec); + get_valid_blocks(sbi, segno, true); return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
valid_blocks * 2 : valid_blocks; @@ -897,7 +897,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -931,7 +931,7 @@ next: blk_finish_plug(&plug); if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + get_valid_blocks(sbi, start_segno, true) == 0) sec_freed = 1; stat_inc_call_count(sbi->stat_info); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dc2737cfaa67..8ab152f209af 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -636,7 +636,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + if (get_valid_blocks(sbi, segno, true) == 0) clear_bit(GET_SECNO(sbi, segno), dirty_i->victim_secmap); } @@ -657,7 +657,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -1188,7 +1188,7 @@ next: secno = GET_SECNO(sbi, start); start_segno = secno * sbi->segs_per_sec; if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); @@ -2938,7 +2938,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if 
(valid_blocks > sbi->blocks_per_seg) { diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 39ef9cc0093b..053166038bfe 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -309,13 +309,13 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -722,8 +722,8 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), - sbi->segs_per_sec) >= sbi->fggc_threshold) + if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), true) >= + sbi->fggc_threshold) return true; return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d50eb8d27e60..aa6ee31ef39b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -997,7 +997,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) if ((i % 10) == 0) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else @@ -1023,7 +1023,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) seq_printf(seq, " %.2x", se->cur_valid_map[j]); seq_putc(seq, '\n'); From d1c1a744c455f1f76098792f9b1bdd682fc95dfe Mon Sep 17 00:00:00 2001 From: 
Jaegeuk Kim Date: Fri, 7 Apr 2017 15:08:17 -0700 Subject: [PATCH 250/804] f2fs: clean up some macros in terms of GET_SEGNO This patch cleans several macros by introducing: - BLKS_PER_SEC - GET_SEC_FROM_SEG - GET_SEG_FROM_SEC - GET_ZONE_FROM_SEC - GET_ZONE_FROM_SEG Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 6 +++--- fs/f2fs/f2fs.h | 7 +++---- fs/f2fs/file.c | 3 +-- fs/f2fs/gc.c | 17 ++++++++--------- fs/f2fs/segment.c | 20 ++++++++++---------- fs/f2fs/segment.h | 34 ++++++++++++++++++++-------------- 6 files changed, 45 insertions(+), 42 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index dc16a52db275..692beff66bf8 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -109,8 +109,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -134,7 +134,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, true); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cbc35660c466..e93bf44c34fa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -202,12 +202,11 @@ enum { #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) -#define MAX_DISCARD_BLOCKS(sbi) \ - ((1 << 
(sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) -#define DISCARD_ISSUE_RATE 8 +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b06a52e33a79..321bfca8a4f9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1897,7 +1897,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; @@ -1961,7 +1960,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ec07940ea722..fe7716269b33 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -211,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) continue; clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -219,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -343,7 +343,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); + secno = 
GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; @@ -372,7 +372,7 @@ next: if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -1006,7 +1006,7 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count, blocks_per_sec; + u64 main_count, resv_count, ovp_count; DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1014,8 +1014,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, - (main_count - resv_count)); + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8ab152f209af..40474e7c2033 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -637,7 +637,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, dirty_i->nr_dirty[t]--; if (get_valid_blocks(sbi, segno, true) == 0) - clear_bit(GET_SECNO(sbi, segno), + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -1185,8 +1185,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), @@ -1541,8 +1541,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = 
FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1552,8 +1552,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1584,8 +1584,8 @@ find_other_zone: secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1629,7 +1629,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 053166038bfe..5f6ef163aa8f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -78,12 +78,16 @@ ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? 
\ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ ((segno) / (sbi)->segs_per_sec) -#define GET_SEGNO_FROM_SECNO(sbi, secno) \ +#define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - (((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ ((sbi)->sm_info->ssa_blkaddr + (segno)) @@ -305,7 +309,7 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, @@ -360,8 +364,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -381,7 +385,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -392,8 +397,8 @@ static inline void __set_test_and_free(struct 
f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -414,7 +419,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -479,12 +485,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int overprovision_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); } static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool need_SSR(struct f2fs_sb_info *sbi) @@ -722,7 +728,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEGNO_FROM_SECNO(sbi, secno), true) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= sbi->fggc_threshold) return true; return false; From 68033a5ab66f3973813d9b18fa9a28b1da48d13d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Apr 2017 17:25:54 -0700 Subject: [PATCH 251/804] f2fs: avoid frequent checkpoint during f2fs_gc Now we're doing SSR aggressively more than ever before, so once we reach to the reserved_segment, f2fs_balance_fs will call 
f2fs_gc, which triggers checkpoint every time. We actually must avoid that. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fe7716269b33..84ade968d149 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -966,9 +966,11 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + if (prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } if (has_not_enough_free_secs(sbi, 0, 0)) gc_type = FG_GC; } From 6e4fee6a144e7f275217cf4900d1cdd12f39d42d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Apr 2017 09:25:22 +0800 Subject: [PATCH 252/804] f2fs: extract rb-tree operation infrastructure rb-tree lookup/update functions are deeply coupled into extent cache codes, it's very hard to reuse these basic functions, this patch extracts common rb-tree operation infrastructure for later reuse.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 291 +++++++++++++++++++++++------------------ fs/f2fs/f2fs.h | 20 ++- 2 files changed, 179 insertions(+), 132 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6934f014e0f..68e649a31c7d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,146 @@ #include "node.h" #include +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +static struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion 
after. + * tree must stay unchanged between lookup and insertion. + */ +static struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -102,36 +242,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct 
f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -237,17 +347,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +373,6 @@ out: return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. 
- */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +427,7 @@ static struct extent_node *__insert_extent_tree(struct inode 
*inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -447,7 +477,10 @@ static void f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, &insert_p, &insert_parent); if (!en) en = next_en; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e93bf44c34fa..1a5737831401 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -444,16 +444,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; From fa3a914e8bf8f0f06e9423fc857ca4f5d44d00d3 Mon Sep 17 00:00:00 
2001 From: Chao Yu Date: Mon, 27 Mar 2017 18:14:05 +0800 Subject: [PATCH 253/804] f2fs: shrink blk plug region Don't use blk plug covering area where there won't be any IOs being issued. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 40474e7c2033..1e726e893eec 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -932,9 +932,8 @@ repeat: if (kthread_should_stop()) return 0; - blk_start_plug(&plug); - mutex_lock(&dcc->cmd_lock); + blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -944,6 +943,7 @@ repeat: if (iter++ > DISCARD_ISSUE_RATE) break; } + blk_finish_plug(&plug); list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->state == D_DONE) { @@ -953,8 +953,6 @@ repeat: } mutex_unlock(&dcc->cmd_lock); - blk_finish_plug(&plug); - iter = 0; congestion_wait(BLK_RW_SYNC, HZ/50); From be687c56d1f697a86abd1e3359e2b63fd3aa73bc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Apr 2017 19:01:26 -0700 Subject: [PATCH 254/804] f2fs: fix fs corruption due to zero inode page This patch fixes the following scenario. - f2fs_create/f2fs_mkdir - write_checkpoint - f2fs_mark_inode_dirty_sync - block_operations - f2fs_lock_all - f2fs_sync_inode_meta - f2fs_unlock_all - sync_inode_metadata - f2fs_lock_op - f2fs_write_inode - update_inode_page - get_node_page return -ENOENT - new_inode_page - fill_node_footer - f2fs_mark_inode_dirty_sync - ... - f2fs_unlock_op - f2fs_inode_synced - f2fs_lock_all - do_checkpoint In this checkpoint, we can get an inode page which contains zeros having valid node footer only. 
Cc: Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2520fa72b23f..0900814485c7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -316,7 +316,6 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -450,6 +449,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 43eb2bd417a8..35fca4c39993 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -420,8 +420,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -484,6 +482,8 @@ err_out: } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -505,8 +505,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = 
f2fs_add_link(dentry, inode); @@ -521,6 +519,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -551,8 +551,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -566,6 +564,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -592,8 +592,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -619,6 +617,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: From 24f3c7e19565a9438dc075c9cd61a6d8a962c315 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Apr 2017 19:15:33 -0700 Subject: [PATCH 255/804] f2fs: give time to flush dirty pages for checkpoint If all the threads are waiting for checkpoint, we have no chance to flush required dirty pages. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8b106d10afe7..0983b7646444 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -981,6 +981,7 @@ retry_flush_dents: err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; + cond_resched(); goto retry_flush_dents; } @@ -996,6 +997,7 @@ retry_flush_dents: err = f2fs_sync_inode_meta(sbi); if (err) goto out; + cond_resched(); goto retry_flush_dents; } @@ -1010,6 +1012,7 @@ retry_flush_nodes: f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } From 933686cf727b9428a0933c8d94b4ef4c2cedc0e3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Apr 2017 10:01:33 -0700 Subject: [PATCH 256/804] f2fs: allocate hot_data for atomic writes We'd better allocate atomic writes to the hot_data zone. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 321bfca8a4f9..1da2ceaaac3e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1536,6 +1536,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) From 8aa17546af24daf62a9cad74d09ab6797065db48 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Apr 2017 12:02:00 -0700 Subject: [PATCH 257/804] f2fs: fix not to set fsync/dentry mark Otherwise, we can see stale fsync/dentry marks given by previous calls, resulting in giving up roll-forward recovery due to a wrong dentry mark.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 19ea77dc3192..dbf0efeb1cde 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1458,6 +1458,9 @@ continue_unlock: f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { From 9febed8ff933c555a0c637b49e09a4264f6c2907 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Apr 2017 15:46:23 -0700 Subject: [PATCH 258/804] f2fs: avoid dirty node pages in check_only recovery In the check_only mode, we should not make any dirty node pages. Otherwise, we can get this panic: F2FS-fs (nvme0n1p1): Need to recover fsync data ------------[ cut here ]------------ kernel BUG at fs/f2fs/node.c:2204! CPU: 7 PID: 19923 Comm: mount Tainted: G OE 4.9.8 #2 RIP: 0010:[] [] flush_nat_entries+0x43b/0x7d0 [f2fs] Call Trace: [] ? __f2fs_submit_merged_bio+0x5a/0xd0 [f2fs] [] ? __f2fs_submit_merged_bio+0x5a/0xd0 [f2fs] [] ? __f2fs_submit_merged_bio+0x8b/0xd0 [f2fs] [] ? up_write+0x1f/0x40 [] ? __f2fs_submit_merged_bio+0x8b/0xd0 [f2fs] [] write_checkpoint+0x2f4/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x350 [f2fs] [] ? __lock_is_held+0x52/0x70 [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xc32/0x10c0 [f2fs] [] mount_bdev+0x182/0x1b0 [] ? 
f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d025aa83fb5b..907d6b7dde6a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; @@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) goto out; From 062eb908b28711e4c9323a79dfe69a265efd64d5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 14 Apr 2017 23:24:55 +0800 Subject: [PATCH 259/804] f2fs: use rb-tree to track pending discard commands Introduce rb-tree based discard cache infrastructure to speed up lookup and merge operation of discard entry. 
Signed-off-by: Chao Yu [Jaegeuk Kim: initialize dc to avoid build warning] Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 15 +-- fs/f2fs/f2fs.h | 48 ++++++++- fs/f2fs/segment.c | 227 +++++++++++++++++++++++++++++++++-------- 3 files changed, 238 insertions(+), 52 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 68e649a31c7d..221ad086ee00 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -49,7 +49,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, return NULL; } -static struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *__lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -61,7 +61,7 @@ static struct rb_entry *__lookup_rb_tree(struct rb_root *root, return re; } -static struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root *root, struct rb_node **parent, unsigned int ofs) { @@ -92,13 +92,14 @@ static struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simpfy the insertion after. * tree must stay unchanged between lookup and insertion. 
*/ -static struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, - struct rb_node **insert_parent) + struct rb_node **insert_parent, + bool force) { struct rb_node **pnode = &root->rb_node; struct rb_node *parent = NULL, *tmp_node; @@ -145,12 +146,12 @@ static struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, return NULL; lookup_neighbors: - if (ofs == re->ofs) { + if (ofs == re->ofs || force) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&re->rb_node); *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); } - if (ofs == re->ofs + re->len - 1) { + if (ofs == re->ofs + re->len - 1 || force) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&re->rb_node); *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); @@ -481,7 +482,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, (struct rb_entry *)et->cached_en, fofs, (struct rb_entry **)&prev_en, (struct rb_entry **)&next_en, - &insert_p, &insert_parent); + &insert_p, &insert_parent, false); if (!en) en = next_en; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1a5737831401..dd25e6ff0785 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -261,13 +261,26 @@ enum { D_DONE, }; +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ - 
block_t lstart; /* logical start address */ - block_t start; /* actual start address in dev */ - block_t len; /* length */ int state; /* state */ int error; /* bio error */ }; @@ -284,6 +297,7 @@ struct discard_cmd_control { atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -584,6 +598,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) +{ + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); +} + static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { @@ -2640,6 +2672,16 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git 
a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1e726e893eec..9adc3bcfb4f4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -672,7 +672,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static void __add_discard_cmd(struct f2fs_sb_info *sbi, +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { @@ -689,18 +689,46 @@ static void __add_discard_cmd(struct f2fs_sb_info *sbi, dc->state = D_PREP; dc->error = 0; init_completion(&dc->wait); - - mutex_lock(&dcc->cmd_lock); list_add_tail(&dc->list, pend_list); - mutex_unlock(&dcc->cmd_lock); - atomic_inc(&dcc->discard_cmd_cnt); + + return dc; } -static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); + + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_dec(&(SM_I(sbi)->dcc_info->issing_discard)); + atomic_dec(&dcc->issing_discard); + + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (dc->error == -EOPNOTSUPP) dc->error = 0; @@ -708,9 +736,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *d if (dc->error) f2fs_msg(sbi->sb, KERN_INFO, 
"Issue discard failed, ret: %d", dc->error); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); - atomic_dec(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) @@ -833,6 +859,148 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, } } +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, 
*insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + if (tdc) + __remove_discard_cmd(sbi, tdc); + + merged = true; + } + + if (!merged) + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + mutex_unlock(&dcc->cmd_lock); +} + static int __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { @@ -845,50 +1013,24 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, blkstart -= FDEV(devi).start_blk; } - __add_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); + 
__update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); return 0; } -static void __punch_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc, block_t blkaddr) -{ - block_t end_block = START_BLOCK(sbi, GET_SEGNO(sbi, blkaddr) + 1); - - if (dc->state == D_DONE || dc->lstart + dc->len <= end_block) { - __remove_discard_cmd(sbi, dc); - return; - } - - if (blkaddr - dc->lstart < dc->lstart + dc->len - end_block) { - dc->start += (end_block - dc->lstart); - dc->len -= (end_block - dc->lstart); - dc->lstart = end_block; - } else { - dc->len = blkaddr - dc->lstart; - } -} - /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->discard_pend_list); - struct list_head *wait_list = &(dcc->discard_wait_list); - struct discard_cmd *dc, *tmp; + struct discard_cmd *dc; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, pend_list, list) { - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) - __punch_discard_cmd(sbi, dc, blkaddr); - } - - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state != D_PREP) wait_for_completion_io(&dc->wait); - __punch_discard_cmd(sbi, dc, blkaddr); - } + __punch_discard_cmd(sbi, dc, blkaddr); } mutex_unlock(&dcc->cmd_lock); @@ -1257,6 +1399,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = 0; + dcc->root = RB_ROOT; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; From 2814d83ec77245d9855b2b053fcab47ae23614d9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 15 Apr 2017 14:09:36 +0800 Subject: 
[PATCH 260/804] f2fs: clean up discard_cmd_control structure Avoid long variable name in discard_cmd_control structure, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/f2fs.h | 8 ++++---- fs/f2fs/segment.c | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dd25e6ff0785..85821e6d71fd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -287,12 +287,12 @@ struct discard_cmd { struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - int nr_discards; /* # of discards in the list */ - struct list_head discard_pend_list; /* store pending entries */ - struct list_head discard_wait_list; /* store on-flushing entries */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list; /* store pending entries */ + struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; + int nr_discards; /* # of discards in the list */ int max_discards; /* max. 
discards to be issued */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9adc3bcfb4f4..d237efa523ca 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -677,7 +677,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->discard_pend_list); + struct list_head *pend_list = &(dcc->pend_list); struct discard_cmd *dc; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); @@ -852,7 +852,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); - list_move_tail(&dc->list, &dcc->discard_wait_list); + list_move_tail(&dc->list, &dcc->wait_list); } } else { __remove_discard_cmd(sbi, dc); @@ -1040,8 +1040,8 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->discard_pend_list); - struct list_head *wait_list = &(dcc->discard_wait_list); + struct list_head *pend_list = &(dcc->pend_list); + struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; struct blk_plug plug; @@ -1065,8 +1065,8 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *pend_list = &dcc->discard_pend_list; - struct list_head *wait_list = &dcc->discard_wait_list; + struct list_head *pend_list = &dcc->pend_list; + struct list_head *wait_list = &dcc->wait_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int iter = 0; @@ -1214,7 +1214,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct 
cp_control *cpc, unsigned int start = 0, end = -1; bool force = (cpc->reason == CP_DISCARD); struct discard_entry *de = NULL; - struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -1263,7 +1263,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -1289,7 +1289,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; @@ -1390,9 +1390,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; - INIT_LIST_HEAD(&dcc->discard_entry_list); - INIT_LIST_HEAD(&dcc->discard_pend_list); - INIT_LIST_HEAD(&dcc->discard_wait_list); + INIT_LIST_HEAD(&dcc->entry_list); + INIT_LIST_HEAD(&dcc->pend_list); + INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); From 0e9f98f97b6871261a5fd7420402039cc6a77198 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 15 Apr 2017 14:09:37 +0800 Subject: [PATCH 261/804] f2fs: in prior to issue big discard Keep issuing big size discard in prior instead of the one with random size, so that we expect that it will help to: - be quick to recycle unused large space in flash storage device. 
- give a chance to a) wait for small discards to be merged into bigger ones, or b) avoid issuing discards for blocks that are being reallocated by SSR. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +++++- fs/f2fs/segment.c | 54 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 85821e6d71fd..9ede6bd15084 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -255,6 +255,11 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + enum { D_PREP, D_SUBMIT, @@ -288,7 +293,7 @@ struct discard_cmd { struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ - struct list_head pend_list; /* store pending entries */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d237efa523ca..c77037521dfe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -677,9 +677,13 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->pend_list); + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; @@ -885,9 +889,16 @@ do_insert: return dc; } +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd
*dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, block_t blkaddr) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_info di = dc->di; bool modified = false; @@ -898,6 +909,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, if (blkaddr > di.lstart) { dc->len = blkaddr - dc->lstart; + __relocate_discard_cmd(dcc, dc); modified = true; } @@ -911,6 +923,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart++; dc->len--; dc->start++; + __relocate_discard_cmd(dcc, dc); } } } @@ -969,6 +982,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di)) { prev_dc->di.len += di.len; + __relocate_discard_cmd(dcc, prev_dc); di = prev_dc->di; tdc = prev_dc; merged = true; @@ -980,6 +994,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; + __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); @@ -1040,16 +1055,20 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list = &(dcc->pend_list); + struct list_head *pend_list; struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; struct blk_plug plug; + int i; mutex_lock(&dcc->cmd_lock); blk_start_plug(&plug); - list_for_each_entry_safe(dc, tmp, pend_list, list) - __submit_discard_cmd(sbi, dc); + for (i = 0; i < MAX_PLIST_NUM; i++) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + __submit_discard_cmd(sbi, dc); + } blk_finish_plug(&plug); list_for_each_entry_safe(dc, tmp, wait_list, list) { @@ -1065,26 +1084,30 @@ static int 
issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *pend_list = &dcc->pend_list; + struct list_head *pend_list; struct list_head *wait_list = &dcc->wait_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0; + int iter = 0, i; repeat: if (kthread_should_stop()) return 0; mutex_lock(&dcc->cmd_lock); blk_start_plug(&plug); - list_for_each_entry_safe(dc, tmp, pend_list, list) { - f2fs_bug_on(sbi, dc->state != D_PREP); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); - if (is_idle(sbi)) - __submit_discard_cmd(sbi, dc); + if (is_idle(sbi)) + __submit_discard_cmd(sbi, dc); - if (iter++ > DISCARD_ISSUE_RATE) - break; + if (iter++ > DISCARD_ISSUE_RATE) + goto next_step; + } } +next_step: blk_finish_plug(&plug); list_for_each_entry_safe(dc, tmp, wait_list, list) { @@ -1099,7 +1122,7 @@ repeat: congestion_wait(BLK_RW_SYNC, HZ/50); wait_event_interruptible(*q, kthread_should_stop() || - !list_empty(pend_list) || !list_empty(wait_list)); + atomic_read(&dcc->discard_cmd_cnt)); goto repeat; } @@ -1379,7 +1402,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; - int err = 0; + int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; @@ -1391,7 +1414,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&dcc->entry_list); - INIT_LIST_HEAD(&dcc->pend_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); From 048fe2a0a94798c9708f4ad3a0d3bcdd095f5da8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 15 Apr 2017 14:09:38 +0800 Subject: 
[PATCH 262/804] f2fs: trace __submit_discard_cmd Add an event class f2fs_discard for introducing f2fs_queue_discard, then use f2fs_{queue,issue}_discard to trace __{queue,submit}_discard_cmd. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- include/trace/events/f2fs.h | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c77037521dfe..e44fea9e2205 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -843,6 +843,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (dc->state != D_PREP) return; + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + dc->error = __blkdev_issue_discard(dc->bdev, SECTOR_FROM_BLOCK(dc->start), SECTOR_FROM_BLOCK(dc->len), @@ -1021,7 +1023,7 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, { block_t lblkstart = blkstart; - trace_f2fs_issue_discard(bdev, blkstart, blklen); + trace_f2fs_queue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 8ca1ddf50dc1..0796b2bf6870 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1118,7 +1118,7 @@ TRACE_EVENT(f2fs_write_checkpoint, __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), @@ -1142,6 +1142,20 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), From
73d23680deb6dada7423e3d7656ca8fd743cc58e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Apr 2017 18:21:43 +0800 Subject: [PATCH 263/804] f2fs: introduce __check_rb_tree_consistence Introduce __check_rb_tree_consistence to check consistence of rb-tree based discard cache in runtime. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 32 ++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 15 +++++++++++++-- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 221ad086ee00..2f98d7039701 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -159,6 +159,38 @@ lookup_neighbors: return re; } +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9ede6bd15084..cab03e5532f0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2687,6 +2687,8 @@ struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool 
f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e44fea9e2205..df0bb6c4bb90 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -912,6 +912,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, if (blkaddr > di.lstart) { dc->len = blkaddr - dc->lstart; __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); modified = true; } @@ -921,11 +922,15 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr, NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } else { dc->lstart++; dc->len--; dc->start++; __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } } } @@ -985,6 +990,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, __is_discard_back_mergeable(&di, &prev_dc->di)) { prev_dc->di.len += di.len; __relocate_discard_cmd(dcc, prev_dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); di = prev_dc->di; tdc = prev_dc; merged = true; @@ -999,13 +1006,17 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); - + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); merged = true; } - if (!merged) + if (!merged) { __insert_discard_tree(sbi, bdev, di.lstart, di.start, di.len, NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + } next: prev_dc = next_dc; if (!prev_dc) From e818486a9ada270ff28bbd21c02294122352b392 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Apr 2017 19:23:39 +0800 Subject: [PATCH 264/804] f2fs: unlock cp_rwsem early for IPU writes For IPU writes, there won't be any updates in dnode page since we will reuse old block address instead of
allocating new one, so we don't need to lock cp_rwsem during IPU IO submitting. Signed-off-by: Chao Yu --- fs/f2fs/data.c | 6 +++++- fs/f2fs/f2fs.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 85da089004ee..64aa38b21bf9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1355,6 +1355,8 @@ retry_encrypt: !is_cold_data(page) && !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { + f2fs_unlock_op(F2FS_I_SB(inode)); + fio->cp_rwsem_locked = false; err = rewrite_data_page(fio); set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); @@ -1390,6 +1392,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .cp_rwsem_locked = true, }; trace_f2fs_writepage(page, DATA); @@ -1447,7 +1450,8 @@ write: err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + if (fio.cp_rwsem_locked) + f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) goto redirty_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cab03e5532f0..6e73b4aa0de2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -860,6 +860,7 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ + bool cp_rwsem_locked; /* indicate cp_rwsem is held */ }; #define is_read_io(rw) ((rw) == READ) From b88a1ae0f2d2baa6aab582fc22c7af238547a1e3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 18 Apr 2017 19:27:39 +0800 Subject: [PATCH 265/804] f2fs: add undiscard blocks stat This patch adds to account undiscard blocks. 
Signed-off-by: Chao Yu --- fs/f2fs/debug.c | 5 +++-- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 9 +++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 692beff66bf8..6102737473d4 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -69,6 +69,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->issing_discard); si->nr_discard_cmd = atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); @@ -348,11 +349,11 @@ static int stat_show(struct seq_file *s, void *v) si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " - "Discard: (%4d %4d)) cmd: %4d\n", + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, si->nr_discarding, si->nr_discarded, - si->nr_discard_cmd); + si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. %4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6e73b4aa0de2..1ca8f8963e61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -299,6 +299,7 @@ struct discard_cmd_control { struct mutex cmd_lock; int nr_discards; /* # of discards in the list */ int max_discards; /* max. 
discards to be issued */ + unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ @@ -2457,6 +2458,7 @@ struct f2fs_stat_info { int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_flushing, nr_flushed, nr_discarding, nr_discarded; int nr_discard_cmd; + unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df0bb6c4bb90..ba46aa20db1a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -695,6 +695,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; return dc; } @@ -723,6 +724,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, list_del(&dc->list); rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; kmem_cache_free(discard_cmd_slab, dc); @@ -909,8 +911,11 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, return; } + dcc->undiscard_blks -= di.len; + if (blkaddr > di.lstart) { dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); modified = true; @@ -928,6 +933,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart++; dc->len--; dc->start++; + dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); @@ -989,6 +995,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di)) { prev_dc->di.len += di.len; + 
dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); @@ -1003,6 +1010,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); @@ -1436,6 +1444,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = 0; + dcc->undiscard_blks = 0; dcc->root = RB_ROOT; init_waitqueue_head(&dcc->discard_wait_queue); From ac2de6c6cbd6e927584b805a10172eb5f1f1d330 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Tue, 18 Apr 2017 11:57:16 +0000 Subject: [PATCH 266/804] f2fs: introduce async IPU policy This patch introduces an ASYNC IPU policy. Under a scenario with a large number of async updates (e.g. log writing in Android), the disk would be seriously fragmented, and GC would be triggered more frequently. This patch uses IPU to rewrite async update writes, since async writes are NOT sensitive to IO latency.
Signed-off-by: Hou Pengyang --- fs/f2fs/data.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.h | 12 +++++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 64aa38b21bf9..c990c4735505 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1354,7 +1354,7 @@ retry_encrypt: if (unlikely(fio->old_blkaddr != NEW_ADDR && !is_cold_data(page) && !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { + need_inplace_update(inode, fio))) { f2fs_unlock_op(F2FS_I_SB(inode)); fio->cp_rwsem_locked = false; err = rewrite_data_page(fio); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1da2ceaaac3e..1ed58a631bac 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1903,7 +1903,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5f6ef163aa8f..3cd780a42f51 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -556,9 +556,11 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; @@ -581,6 +583,14 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) From 1b73445838adfda63d6bba040e3aec87b7d9818a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim 
Date: Thu, 13 Apr 2017 15:17:00 -0700 Subject: [PATCH 267/804] f2fs: add ioctl to flush data from faster device to cold area This patch adds an ioctl to flush data in faster device to cold area. User can give device number and number of segments to move. It doesn't move it if there is only one device. The parameter looks like: struct f2fs_flush_device { u32 dev_num; /* device number to flush */ u32 segments; /* # of segments to flush */ }; Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++-- fs/f2fs/file.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-- fs/f2fs/gc.c | 42 +++++++++++++++++++---------- fs/f2fs/segment.c | 14 +++++++--- fs/f2fs/segment.h | 7 ++++- 5 files changed, 120 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ca8f8963e61..b7052f911ea5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -367,6 +367,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -403,6 +405,11 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + /* * For INODE and NODE manager */ @@ -1047,7 +1054,6 @@ struct f2fs_sb_info { int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ /* For sysfs suppport */ @@ -2429,7 +2435,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, int start_gc_thread(struct f2fs_sb_info *sbi); void 
stop_gc_thread(struct f2fs_sb_info *sbi); block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); void build_gc_manager(struct f2fs_sb_info *sbi); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1ed58a631bac..fc1e6d048fd2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1860,7 +1860,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -2216,6 +2216,69 @@ err_out: return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while 
(start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2253,6 +2316,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); default: return -ENOTTY; } @@ -2325,8 +2390,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 84ade968d149..39b738dc36c7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -84,7 +84,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -176,7 +176,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) p->offset = 0; else - p->offset = sbi->last_victim[p->gc_mode]; + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -295,6 +295,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char 
alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -308,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -324,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -361,11 +371,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; - sbi->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } @@ -912,7 +922,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, 
segno, gc_type); else @@ -939,13 +948,14 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), @@ -990,13 +1000,17 @@ gc_more: sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba46aa20db1a..888bde8cec34 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -401,7 +401,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -1834,6 +1834,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) return 0; + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; return CURSEG_I(sbi, type)->segno; } @@ -1931,12 +1933,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; int i, cnt; bool reversed = false; /* need_SSR() already forces to do this */ - if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, 
type, SSR)) { + curseg->next_segno = segno; return 1; + } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(type)) { @@ -1960,9 +1965,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) for (; cnt-- > 0; reversed ? i-- : i++) { if (i == type) continue; - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, i, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } } return 0; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3cd780a42f51..93cc4e504aab 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -138,7 +138,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -233,6 +236,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { From b16a719c967a5e27c036f0435ce0632983d1a857 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Apr 2017 13:47:25 -0700 Subject: [PATCH 268/804] f2fs: fix _IOW usage This patch fixes wrong _IOW usage. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b7052f911ea5..4a517cde5fd8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -362,9 +362,10 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) #define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ From e9b7e2e3bbc3cb0dd0a4f3af2c1a679d307a0380 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Apr 2017 15:03:15 -0700 Subject: [PATCH 269/804] f2fs: assign allocation hint for warm/cold data This patch gives slower device region to warm/cold data area more eagerly. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 39b738dc36c7..f3102a895c48 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1033,4 +1033,9 @@ void build_gc_manager(struct f2fs_sb_info *sbi) sbi->fggc_threshold = div64_u64((main_count - ovp_count) * BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } From 11538a935f968ab8aefaeedec772cae27e3f57eb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Apr 2017 19:38:33 +0200 Subject: [PATCH 270/804] f2fs: improve definition of statistic macros With a recent addition of f2fs_lookup_extent_tree(), we get a warning about the use of empty macros: fs/f2fs/extent_cache.c: In function 'f2fs_lookup_extent_tree': fs/f2fs/extent_cache.c:358:32: error: suggest braces around empty body in an 'else' statement [-Werror=empty-body] stat_inc_rbtree_node_hit(sbi); A good way to avoid the warning and make the code more robust is to define all no-op macros as 'do { } while (0)'. 
Fixes: 54c2258cd63a ("f2fs: extract rb-tree operation infrastructure") Signed-off-by: Arnd Bergmann Reivewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 58 +++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4a517cde5fd8..c98c07cee464 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2599,35 +2599,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_volatile_write(inode) -#define stat_dec_volatile_write(inode) -#define stat_update_max_volatile_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define 
stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } From 0cfd113b84607c3827f01049bc9cd75559a906d1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Apr 2017 13:51:57 -0700 Subject: [PATCH 271/804] f2fs: fix out-of free segments This patch also reverts d0db7703ac1 ("f2fs: do SSR in higher priority"). 
This patch fixes out of free segments caused by many small file creation by 1) mkfs -s 1 2G 2) mount 3) untar - produce 60000 small files in a burst 4) sync - flush node pages - flush imeta Here, when we do f2fs_balance_fs, we missed # of imeta blocks, resulting in skipping to check has_not_enough_free_secs. Another test is done by 1) mkfs -s 12 2G 2) mount 3) untar - produce 60000 small files in a burst 4) sync - flush node pages - flush imeta In this case, this patch also fixes wrong block allocation under large section size. Reported-by: William Brana Cc: Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- fs/f2fs/inode.c | 3 ++- fs/f2fs/segment.c | 26 +++++++++++++++++++++----- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c990c4735505..67d05b001722 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1470,7 +1470,8 @@ out: } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0900814485c7..518f49643092 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -338,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections.
*/ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 888bde8cec34..f5dbb6ef8390 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -388,11 +388,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -1718,6 +1715,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; +} + /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1831,6 +1839,10 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { + /* if segs_per_sec is large than 1, we need to keep original policy. 
*/ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) return 0; @@ -1980,17 +1992,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) From 352c91d0d482987cd5ed21f46671a80e2616c9a5 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Fri, 21 Apr 2017 12:41:48 +0000 Subject: [PATCH 272/804] f2fs: skip encrypted inode in ASYNC IPU policy Async request may be throttled in block layer, so page for async may keep WRITE_BACK for a long time. For encrypted inode, we need to wait on page writeback no matter if the device supports BDI_CAP_STABLE_WRITES. This may result in a higher waiting page writeback time for async encrypted inode page. This patch skips IPU for encrypted inode's updating write.
Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 93cc4e504aab..8ad22b8cbba7 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -593,7 +593,8 @@ static inline bool need_inplace_update(struct inode *inode, */ if (policy & (0x1 << F2FS_IPU_ASYNC) && fio && fio->op == REQ_OP_WRITE && - !(fio->op_flags & REQ_SYNC)) + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) return true; /* this is only set during fdatasync */ From 38f30f047da2994e9b71b5411e2e60757b21f74d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Sat, 22 Apr 2017 10:39:20 +0800 Subject: [PATCH 273/804] f2fs: fix multiple f2fs_add_link() having same name for inline dentry Commit 88c5c13a5027 (f2fs: fix multiple f2fs_add_link() calls having same name) does not cover the scenario where inline dentry is enabled. In that case, F2FS_I(dir)->task will be NULL, and __f2fs_add_link will lookup dentries one more time. This patch fixes it by moving the assignment of current task to an upper level to cover both normal and inline dentry.
Cc: Fixes: 88c5c13a5027 (f2fs: fix multiple f2fs_add_link() calls having same name) Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index db077960e376..f44e1370890f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -207,13 +207,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - /* This is to increase the speed of f2fs_create */ - if (!de && room) { - F2FS_I(dir)->task = current; - if (F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; - } + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; } return de; @@ -254,6 +250,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } From 9933f6e186a32d6f2da5581d97523b7fd99a4eba Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sat, 22 Apr 2017 18:06:26 +0800 Subject: [PATCH 274/804] f2fs: seperate read nat page from nat_tree_lock This patch seperate nat page read io from nat_tree_lock. -lock_page -get_node_info() -current_nat_addr ...... -> write_checkpoint -get_meta_page Because we lock node page, we can make sure no other threads modify this nid concurrently. So we just obtain current_nat_addr under nat_tree_lock, node info is always same in both nat pack. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dbf0efeb1cde..a63399338ff4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -376,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -401,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); From e7a9ce2e7cc68d29ed27926a40eafb33c9bb62e0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 00:21:34 +0800 Subject: [PATCH 275/804] f2fs: delay awaking discard thread It's better to delay awaking discard thread while queuing discard commands in checkpoint, it will help to give more chances for merging big and small discard. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f5dbb6ef8390..93c6d8a00722 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1047,7 +1047,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, blkstart -= FDEV(devi).start_blk; } __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); return 0; } @@ -1414,6 +1413,8 @@ skip: SM_I(sbi)->dcc_info->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } + + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) From 9170805a6362eb449000f3e47a4a9c39e0f0dd8a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 00:21:35 +0800 Subject: [PATCH 276/804] f2fs: enable small discard by default This patch start to enable 4K granularity small discard by default when realtime discard is on, so, in seriously fragmented space, small size discard can be issued in time to avoid useless storage space occupying of invalid filesystem's data, then performance of flash storage can be recovered. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/segment.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c98c07cee464..635bca168078 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -297,8 +297,8 @@ struct discard_cmd_control { struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. 
discards to be issued */ unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 93c6d8a00722..cc617da64d38 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1441,7 +1441,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&dcc->issing_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; - dcc->max_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; dcc->root = RB_ROOT; From 72b8a76169d7b2743479a693f4b15d36ccf70e0c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 20:21:37 +0800 Subject: [PATCH 277/804] f2fs: introduce __issue_discard_cmd Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 63 ++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc617da64d38..b49818dd02c4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1050,6 +1050,32 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int i, iter = 0; + + mutex_lock(&dcc->cmd_lock); + blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (!issue_cond || is_idle(sbi)) + __submit_discard_cmd(sbi, dc); + if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + goto out; + } + } +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + /* This should be covered by global mutex, 
&sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -1072,27 +1098,16 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *pend_list; struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; - struct blk_plug plug; - int i; + + __issue_discard_cmd(sbi, false); mutex_lock(&dcc->cmd_lock); - - blk_start_plug(&plug); - for (i = 0; i < MAX_PLIST_NUM; i++) { - pend_list = &dcc->pend_list[i]; - list_for_each_entry_safe(dc, tmp, pend_list, list) - __submit_discard_cmd(sbi, dc); - } - blk_finish_plug(&plug); - list_for_each_entry_safe(dc, tmp, wait_list, list) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } - mutex_unlock(&dcc->cmd_lock); } @@ -1101,32 +1116,15 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *pend_list; struct list_head *wait_list = &dcc->wait_list; struct discard_cmd *dc, *tmp; - struct blk_plug plug; - int iter = 0, i; repeat: if (kthread_should_stop()) return 0; + __issue_discard_cmd(sbi, true); + mutex_lock(&dcc->cmd_lock); - blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { - pend_list = &dcc->pend_list[i]; - list_for_each_entry_safe(dc, tmp, pend_list, list) { - f2fs_bug_on(sbi, dc->state != D_PREP); - - if (is_idle(sbi)) - __submit_discard_cmd(sbi, dc); - - if (iter++ > DISCARD_ISSUE_RATE) - goto next_step; - } - } -next_step: - blk_finish_plug(&plug); - list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->state == D_DONE) { wait_for_completion_io(&dc->wait); @@ -1135,7 +1133,6 @@ next_step: } mutex_unlock(&dcc->cmd_lock); - iter = 0; congestion_wait(BLK_RW_SYNC, HZ/50); wait_event_interruptible(*q, kthread_should_stop() || From 
6cd09438a3311c769442e877eb5fc1ae32ccc3e7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Apr 2017 20:21:38 +0800 Subject: [PATCH 278/804] f2fs: introduce __wait_discard_cmd Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b49818dd02c4..a0a0592e1681 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1076,6 +1076,22 @@ out: mutex_unlock(&dcc->cmd_lock); } +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || dc->state == D_DONE) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -1097,18 +1113,8 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) /* This comes from f2fs_put_super */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - __issue_discard_cmd(sbi, false); - - mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } - mutex_unlock(&dcc->cmd_lock); + __wait_discard_cmd(sbi, false); } static int issue_discard_thread(void *data) @@ -1116,22 +1122,12 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = 
&dcc->discard_wait_queue; - struct list_head *wait_list = &dcc->wait_list; - struct discard_cmd *dc, *tmp; repeat: if (kthread_should_stop()) return 0; __issue_discard_cmd(sbi, true); - - mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->state == D_DONE) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); - } - } - mutex_unlock(&dcc->cmd_lock); + __wait_discard_cmd(sbi, true); congestion_wait(BLK_RW_SYNC, HZ/50); From b465728ac362b0b069a3935fb65f1f019ed65ab2 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Tue, 25 Apr 2017 12:45:12 +0000 Subject: [PATCH 279/804] f2fs: reconstruct code to write a data page This patch introduces encrypt_one_page which encrypts one data page before submit_bio, and change the use of need_inplace_update. Signed-off-by: Hou Pengyang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 81 +++++++++++++++++++++++++++++------------------ fs/f2fs/file.c | 4 +-- fs/f2fs/segment.h | 6 +--- 3 files changed, 54 insertions(+), 37 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 67d05b001722..db9ed78156e8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1300,6 +1300,49 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_bios(fio->sbi); + 
congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; @@ -1320,30 +1363,9 @@ int do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - PAGE_SIZE, 0, - fio->page->index, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } + err = encrypt_one_page(fio); + if (err) + goto out_writepage; set_page_writeback(page); @@ -1351,15 +1373,14 @@ retry_encrypt: * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode, fio))) { - f2fs_unlock_op(F2FS_I_SB(inode)); + if (need_inplace_update(fio)) { + f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); + f2fs_unlock_op(fio->sbi); fio->cp_rwsem_locked = false; + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fc1e6d048fd2..005129e03a67 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1903,7 +1903,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode, NULL)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2038,7 +2038,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) return -EINVAL; if (f2fs_readonly(sbi->sb)) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 8ad22b8cbba7..10bf05d4cff4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -564,16 +564,12 @@ enum { F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode, +static inline bool need_inplace_update_policy(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; From cf1770e0fa436b62f454732cd0e7842bed61430a Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Tue, 25 Apr 2017 12:45:13 +0000 Subject: 
[PATCH 280/804] f2fs: lookup extent cache first under IPU scenario If a page is cold, NOT atomic written and need_ipu now, there is a high probability that IPU should be adapted. For IPU, we try to check extent tree to get the block index first, instead of reading the dnode page, which may lead to a useless dnode IO, since no need to update the dnode index for IPU. Signed-off-by: Hou Pengyang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/gc.c --- fs/f2fs/data.c | 16 ++++++++++++++-- fs/f2fs/gc.c | 3 ++- fs/f2fs/segment.c | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index db9ed78156e8..b89b97be5ee4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1348,9 +1348,20 @@ int do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + if (fio->old_blkaddr != NULL_ADDR && + fio->old_blkaddr != NEW_ADDR) { + ipu_force = true; + goto got_it; + } + } err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) return err; @@ -1362,7 +1373,7 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - +got_it: err = encrypt_one_page(fio); if (err) goto out_writepage; @@ -1373,7 +1384,7 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data.
*/ - if (need_inplace_update(fio)) { + if (ipu_force || need_inplace_update(fio)) { f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); f2fs_unlock_op(fio->sbi); fio->cp_rwsem_locked = false; @@ -1410,6 +1421,7 @@ static int __write_data_page(struct page *page, bool *submitted, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = false, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f3102a895c48..32b3ae415260 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -713,7 +713,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a0a0592e1681..69ead09ba06f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -311,6 +311,7 @@ static int __commit_inmem_pages(struct inode *inode, } fio.page = page; + fio.old_blkaddr = NULL_ADDR; err = do_write_data_page(&fio); if (err) { unlock_page(page); From 0905adc8c720177c5c85c95d4aeb99d9a8cd92d5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Apr 2017 15:20:16 -0700 Subject: [PATCH 281/804] f2fs: introduce valid_ipu_blkaddr to clean up This patch introduces valid_ipu_blkaddr to clean up checking block address for inplace-update. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b89b97be5ee4..fe27b2851336 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1331,8 +1331,6 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (fio->old_blkaddr == NEW_ADDR) - return false; if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; if (is_cold_data(fio->page)) @@ -1343,6 +1341,15 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) return need_inplace_update_policy(inode, fio); } +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; @@ -1356,8 +1363,8 @@ int do_write_data_page(struct f2fs_io_info *fio) if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (fio->old_blkaddr != NULL_ADDR && - fio->old_blkaddr != NEW_ADDR) { + + if (valid_ipu_blkaddr(fio)) { ipu_force = true; goto got_it; } @@ -1384,7 +1391,7 @@ got_it: * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (ipu_force || need_inplace_update(fio)) { + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); f2fs_unlock_op(fio->sbi); fio->cp_rwsem_locked = false; From 95d6aa32c3c2a250b30562cc19db9dac602b93f4 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 25 Apr 2017 16:28:48 -0700 Subject: [PATCH 282/804] f2fs: sanity check segment count F2FS uses 4 bytes to represent block address. As a result, supported size of disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments. 
Signed-off-by: Jin Qian Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ include/linux/f2fs_fs.h | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aa6ee31ef39b..9c310d8a6da1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1496,6 +1496,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e2d239ed4c60..661200e6d281 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -301,6 +301,12 @@ struct f2fs_nat_block { #define SIT_VBLOCK_MAP_SIZE 64 #define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) +/* + * F2FS uses 4 bytes to represent block address. As a result, supported size of + * disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments. + */ +#define F2FS_MAX_SEGMENT ((16 * 1024 * 1024) / 2) + /* * Note that f2fs_sit_entry->vblocks has the following bit-field information. * [15:10] : allocation type such as CURSEG_XXXX_TYPE From 5abcd71d0fd8a642d848a13de041e2112df21a23 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 26 Apr 2017 11:11:12 -0700 Subject: [PATCH 283/804] f2fs: nullify fio->encrypted_page for each writes This makes sure each write request has nullified encrypted_page pointer. 
Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 69ead09ba06f..9fcc2f9aa732 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -290,8 +290,7 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, - .encrypted_page = NULL, + .op_flags = REQ_SYNC | REQ_PRIO, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -312,6 +311,7 @@ static int __commit_inmem_pages(struct inode *inode, fio.page = page; fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; err = do_write_data_page(&fio); if (err) { unlock_page(page); From 7ec84ed608e4fe4b00189ed36363da553c092eaf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Apr 2017 17:39:54 +0800 Subject: [PATCH 284/804] f2fs: don't hold cmd_lock during waiting discard command Previously, with protection of cmd_lock, we will wait for end io of discard command which potentially may lead long latency, making worse concurrency. So, in this patch, we try to add reference into discard entry to prevent the entry being released by other thread, then we can avoid holding global cmd_lock during waiting discard to finish. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 635bca168078..b20b3b29bc27 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -286,6 +286,7 @@ struct discard_cmd { struct list_head list; /* command list */ struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ int state; /* state */ int error; /* bio error */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9fcc2f9aa732..6a79d0b3b423 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -688,6 +688,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, dc->lstart = lstart; dc->start = start; dc->len = len; + dc->ref = 0; dc->state = D_PREP; dc->error = 0; init_completion(&dc->wait); @@ -1086,6 +1087,8 @@ static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { if (!wait_cond || dc->state == D_DONE) { + if (dc->ref) + continue; wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } @@ -1098,17 +1101,29 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; + bool need_wait = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); if (dc) { - if (dc->state != D_PREP) - wait_for_completion_io(&dc->wait); - __punch_discard_cmd(sbi, dc, blkaddr); + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } } - mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); 
+ mutex_unlock(&dcc->cmd_lock); + } } /* This comes from f2fs_put_super */ From 0756d8f7982ebaf4de78364a7cada24a6e31098c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Apr 2017 17:39:55 +0800 Subject: [PATCH 285/804] f2fs: shrink size of struct discard_cmd In order to shrink size of struct discard_cmd, change variable type of @state in struct discard_cmd from int to unsigned char. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b20b3b29bc27..125d006ed5ff 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -287,7 +287,7 @@ struct discard_cmd { struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ - int state; /* state */ + unsigned char state; /* state */ int error; /* bio error */ }; From 87c98567046f8f5890bbb7e5dab874ced61ccbf8 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Thu, 27 Apr 2017 00:17:21 +0800 Subject: [PATCH 286/804] f2fs: release cp and dnode lock before IPU We don't need to rewrite the page under cp_rwsem and dnode locks. Signed-off-by: Hou Pengyang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 39 ++++++++++++++++++++++++--------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 1 + fs/f2fs/segment.c | 1 + 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fe27b2851336..c1e881242d53 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1366,12 +1366,17 @@ int do_write_data_page(struct f2fs_io_info *fio) if (valid_ipu_blkaddr(fio)) { ipu_force = true; + fio->need_lock = false; goto got_it; } } + + if (fio->need_lock) + f2fs_lock_op(fio->sbi); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1392,22 +1397,26 @@ got_it: * it had better in-place writes for updated data. 
*/ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { - f2fs_bug_on(fio->sbi, !fio->cp_rwsem_locked); - f2fs_unlock_op(fio->sbi); - fio->cp_rwsem_locked = false; - + f2fs_put_dnode(&dn); + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); err = rewrite_data_page(fio); trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); return err; } @@ -1432,7 +1441,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, - .cp_rwsem_locked = true, + .need_lock = true, }; trace_f2fs_writepage(page, DATA); @@ -1468,6 +1477,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = false; err = do_write_data_page(&fio); goto done; } @@ -1485,13 +1495,12 @@ write: if (!err) goto out; } - f2fs_lock_op(sbi); + if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - if (fio.cp_rwsem_locked) - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 125d006ed5ff..bb1b6ce66c1a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -870,7 +870,7 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ - bool cp_rwsem_locked; /* indicate cp_rwsem is held */ 
+ bool need_lock; /* indicate we need to lock cp_rwsem */ }; #define is_read_io(rw) ((rw) == READ) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 32b3ae415260..b527ab0eec1d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -717,6 +717,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = true, }; bool is_dirty = PageDirty(page); int err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6a79d0b3b423..9e15496036ff 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -312,6 +312,7 @@ static int __commit_inmem_pages(struct inode *inode, fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; + fio.need_lock = false, err = do_write_data_page(&fio); if (err) { unlock_page(page); From 30d60edd7becac313a6a3adb9733ea123695b2f6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Apr 2017 20:40:39 +0800 Subject: [PATCH 287/804] f2fs: allow cpc->reason to indicate more than one reason Change to use different bits of cpc->reason to indicate different status, so cpc->reason can indicate more than one reason. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 14 +++++++------- fs/f2fs/f2fs.h | 16 +++++++--------- fs/f2fs/segment.c | 8 ++++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0983b7646444..b1a86997b115 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1055,17 +1055,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); - if (cpc->reason == CP_UMOUNT && + if ((cpc->reason & CP_UMOUNT) && le32_to_cpu(ckpt->cp_pack_total_block_count) > sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); - if (cpc->reason == CP_UMOUNT) + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1273,8 +1273,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1296,7 +1296,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { if (!exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; @@ -1334,7 +1334,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) 
f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb1b6ce66c1a..4c7eb0b6b4ca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -192,13 +192,11 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ @@ -1332,7 +1330,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, { bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set; + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1368,7 +1366,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9e15496036ff..444ea2c4f671 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1262,7 +1262,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); struct discard_entry *de = NULL; struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; @@ -1345,7 +1345,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == 
CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -2849,7 +2849,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2885,7 +2885,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) From a725708ca434461dbaaefa7ec5004373ec3ba054 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Apr 2017 13:56:08 +0800 Subject: [PATCH 288/804] f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard Introduce CP_TRIMMED_FLAG to indicate that all invalid blocks were trimmed before umount. When we later mount an image which contains the flag, we don't record invalid blocks as undiscarded ones, so when fstrim is triggered we can avoid issuing redundant discard commands.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 ++++++++++++++++++++-------- fs/f2fs/super.c | 7 +++++++ include/linux/f2fs_fs.h | 1 + include/trace/events/f2fs.h | 4 +++- 6 files changed, 35 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b1a86997b115..d639fd9062d4 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1060,6 +1060,9 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4c7eb0b6b4ca..b34d527ba809 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -197,6 +197,7 @@ enum { #define CP_SYNC 0x00000004 #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 444ea2c4f671..23e809f64ded 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3084,10 +3084,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -3111,10 +3118,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - 
memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9c310d8a6da1..eeda97b54556 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -797,6 +797,13 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ f2fs_wait_discard_bios(sbi); + if (!sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 661200e6d281..2b7183c5c9a3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 0796b2bf6870..0d02af995547 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -44,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); +TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_block_type(type) \ __print_symbolic(type, \ @@ -118,7 +119,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ - { CP_DISCARD, "Discard" }) + { CP_DISCARD, "Discard" }, \ + { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" }) struct victim_sel_policy; struct f2fs_map_blocks; From 
19023fdfb16384ce695bd7ce07902518d39f5435 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 May 2017 18:09:44 -0700 Subject: [PATCH 289/804] f2fs: flush dirty nats periodically This patch flushes dirty nats in order to acquire available nids by writing checkpoint. Otherwise, we can have no chance to get freed nids. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23e809f64ded..5fdc995b1f1e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -419,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ From d15370b84bd18bbe31358e9ba110ac3b7c8ff18a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 May 2017 18:13:03 -0700 Subject: [PATCH 290/804] f2fs: show available_nids in f2fs/status This patch adds an entry in f2fs/status to show # of available nids. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 +++-- fs/f2fs/f2fs.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6102737473d4..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -97,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -370,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b34d527ba809..4d086c7c2138 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2461,7 +2461,8 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_flushing, nr_flushed, nr_discarding, nr_discarded; From 02ac4707889068a4b739a1ed52637cff8390a41f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 3 May 2017 23:59:13 +0800 Subject: [PATCH 291/804] f2fs: relocate inode_{,un}lock in F2FS_IOC_SETFLAGS This patch expands cover region of 
inode->i_rwsem to keep setting flag atomically. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 005129e03a67..70be377c2236 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1478,10 +1478,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -1495,10 +1495,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); + + inode_unlock(inode); out: mnt_drop_write_file(filp); return ret; From 60a9766f27c7b0f5cae5db408edc6e6bb86538a2 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 26 Apr 2017 15:56:52 +0800 Subject: [PATCH 292/804] f2fs: fix a mount fail for wrong next_scan_nid -write_checkpoint -do_checkpoint -next_free_nid <--- something wrong with next free nid -f2fs_fill_super -build_node_manager -build_free_nids -get_current_nat_page -__get_meta_page <--- attempt to access beyond end of device Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a63399338ff4..833f5fb9858c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1958,6 +1958,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; From b53229d776afa1626da3f362ad9f82884c8555ad Mon Sep 17 00:00:00 2001 From: Eric 
Biggers Date: Fri, 7 Apr 2017 10:58:39 -0700 Subject: [PATCH 293/804] f2fs: sync f2fs_lookup() with ext4_lookup() As for ext4, now that fscrypt_has_permitted_context() correctly handles the case where we have the key for the parent directory but not the child, f2fs_lookup() no longer has to work around it. Also add the same warning message that ext4 uses. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 35fca4c39993..77349d51f952 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -324,9 +324,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); From 64f3b27a09c34664fb3023fcfdd95834948a4a1f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Apr 2017 10:00:08 -0700 Subject: [PATCH 294/804] f2fs: check entire encrypted bigname when finding a dentry If the user has no key for an encrypted dir, fscrypt gives digested dentries. Previously, when looking up a dentry, f2fs only checked its hash value against the first 4 bytes of the digested dentry, which didn't handle hash collisions fully. This patch enhances the lookup to check the entire dentry bytes, as ext4 does.
Eric reported how to reproduce this issue by: # seq -f "edir/abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch # find edir -type f | xargs stat -c %i | sort | uniq | wc -l 100000 # sync # echo 3 > /proc/sys/vm/drop_caches # keyctl new_session # find edir -type f | xargs stat -c %i | sort | uniq | wc -l 99999 Cc: Reported-by: Eric Biggers Signed-off-by: Jaegeuk Kim (fixed f2fs_dentry_hash() to work even when the hash is 0) Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Conflicts: fs/f2fs/inline.c --- fs/f2fs/dir.c | 37 +++++++++++++++++++++---------------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/hash.c | 7 ++++++- fs/f2fs/inline.c | 4 ++-- 4 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f44e1370890f..9dbf44a28520 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -130,19 +130,29 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ + if (de->hash_code != namehash) + goto not_match; + de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (unlikely(!name->name)) { + if (fname->usr_fname->name[0] == '_') { + if (de_name.len >= 16 && + !memcmp(de_name.name + de_name.len - 16, + fname->crypto_buf.name + 8, 16)) + goto found; + goto not_match; + } + name->name = fname->crypto_buf.name; + name->len = fname->crypto_buf.len; + } +#endif + if (de_name.len == name->len && + !memcmp(de_name.name, name->name, name->len)) goto found; - +not_match: if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; @@ -170,12 +180,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - 
f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -526,7 +531,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d086c7c2138..b0a093e38104 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2258,7 +2258,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b3bd1012a4fc..fc8b49696b9d 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); 
inline_dentry = inline_data_addr(ipage); @@ -533,7 +533,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); + name_hash = f2fs_dentry_hash(new_name, NULL); make_dentry_ptr_inline(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); From 73c0288f1c9c90c39170b1246de431adb5a85fc8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Apr 2017 10:00:09 -0700 Subject: [PATCH 295/804] fscrypt: avoid collisions when presenting long encrypted filenames When accessing an encrypted directory without the key, userspace must operate on filenames derived from the ciphertext names, which contain arbitrary bytes. Since we must support filenames as long as NAME_MAX, we can't always just base64-encode the ciphertext, since that may make it too long. Currently, this is solved by presenting long names in an abbreviated form containing any needed filesystem-specific hashes (e.g. to identify a directory block), then the last 16 bytes of ciphertext. This needs to be sufficient to identify the actual name on lookup. However, there is a bug. It seems to have been assumed that due to the use of a CBC (cipher block chaining)-based encryption mode, the last 16 bytes (i.e. the AES block size) of ciphertext would depend on the full plaintext, preventing collisions. However, we actually use CBC with ciphertext stealing (CTS), which handles the last two blocks specially, causing them to appear "flipped". Thus, it's actually the second-to-last block which depends on the full plaintext. This caused long filenames that differ only near the end of their plaintexts to, when observed without the key, point to the wrong inode and be undeletable.
For example, with ext4: # echo pass | e4crypt add_key -p 16 edir/ # seq -f "edir/abcdefghijklmnopqrstuvwxyz012345%.0f" 100000 | xargs touch # find edir/ -type f | xargs stat -c %i | sort | uniq | wc -l 100000 # sync # echo 3 > /proc/sys/vm/drop_caches # keyctl new_session # find edir/ -type f | xargs stat -c %i | sort | uniq | wc -l 2004 # rm -rf edir/ rm: cannot remove 'edir/_A7nNFi3rhkEQlJ6P,hdzluhODKOeWx5V': Structure needs cleaning ... To fix this, when presenting long encrypted filenames, encode the second-to-last block of ciphertext rather than the last 16 bytes. Although it would be nice to solve this without depending on a specific encryption mode, that would mean doing a cryptographic hash like SHA-256 which would be much less efficient. This way is sufficient for now, and it's still compatible with encryption modes like HEH which are strong pseudorandom permutations. Also, changing the presented names is still allowed at any time because they are only provided to allow applications to do things like delete encrypted directories. They're not designed to be used to persistently identify files --- which would be hard to do anyway, given that they're encrypted after all. For ease of backports, this patch only makes the minimal fix to both ext4 and f2fs. It leaves ubifs as-is, since ubifs doesn't compare the ciphertext block yet. Follow-on patches will clean things up properly and make the filesystems use a shared helper function. 
Fixes: 5de0b4d0cd15 ("ext4 crypto: simplify and speed up filename encryption") Reported-by: Gwendal Grignou Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/f2fs/dir.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 13052b85c393..932881f27f2f 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -300,7 +300,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } else { memset(buf, 0, 8); } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); oname->name[0] = '_'; oname->len = 1 + digest_encode(buf, 24, oname->name + 1); return 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 38eb0c8e43b9..dac159a226ad 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1243,9 +1243,9 @@ static inline int ext4_match(struct ext4_filename *fname, if (unlikely(!name)) { if (fname->usr_fname->name[0] == '_') { int ret; - if (de->name_len < 16) + if (de->name_len <= 32) return 0; - ret = memcmp(de->name + de->name_len - 16, + ret = memcmp(de->name + ((de->name_len - 17) & ~15), fname->crypto_buf.name + 8, 16); return (ret == 0) ? 
1 : 0; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9dbf44a28520..c716ab0baf1d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -139,8 +139,8 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, #ifdef CONFIG_F2FS_FS_ENCRYPTION if (unlikely(!name->name)) { if (fname->usr_fname->name[0] == '_') { - if (de_name.len >= 16 && - !memcmp(de_name.name + de_name.len - 16, + if (de_name.len > 32 && + !memcmp(de_name.name + ((de_name.len - 17) & ~15), fname->crypto_buf.name + 8, 16)) goto found; goto not_match; From e9dbf926ed236a065dfd8a8f930564fe0f1f2b73 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 21 Feb 2017 15:07:11 -0800 Subject: [PATCH 296/804] fscrypt: remove broken support for detecting keyring key revocation Filesystem encryption ostensibly supported revoking a keyring key that had been used to "unlock" encrypted files, causing those files to become "locked" again. This was, however, buggy for several reasons, the most severe of which was that when key revocation happened to be detected for an inode, its fscrypt_info was immediately freed, even while other threads could be using it for encryption or decryption concurrently. This could be exploited to crash the kernel or worse. This patch fixes the use-after-free by removing the code which detects the keyring key having been revoked, invalidated, or expired. Instead, an encrypted inode that is "unlocked" now simply remains unlocked until it is evicted from memory. Note that this is no worse than the case for block device-level encryption, e.g. dm-crypt, and it still remains possible for a privileged user to evict unused pages, inodes, and dentries by running 'sync; echo 3 > /proc/sys/vm/drop_caches', or by simply unmounting the filesystem. In fact, one of those actions was already needed anyway for key revocation to work even somewhat sanely. This change is not expected to break any applications. 
In the future I'd like to implement a real API for fscrypt key revocation that interacts sanely with ongoing filesystem operations --- waiting for existing operations to complete and blocking new operations, and invalidating and sanitizing key material and plaintext from the VFS caches. But this is a hard problem, and for now this bug must be fixed. This bug affected almost all versions of ext4, f2fs, and ubifs encryption, and it was potentially reachable in any kernel configured with encryption support (CONFIG_EXT4_ENCRYPTION=y, CONFIG_EXT4_FS_ENCRYPTION=y, CONFIG_F2FS_FS_ENCRYPTION=y, or CONFIG_UBIFS_FS_ENCRYPTION=y). Note that older kernels did not use the shared fs/crypto/ code, but due to the potential security implications of this bug, it may still be worthwhile to backport this fix to them. Fixes: b7236e21d55f ("ext4 crypto: reorganize how we store keys in the inode") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Acked-by: Michael Halcrow --- fs/crypto/crypto.c | 10 +------ fs/crypto/fname.c | 2 +- fs/crypto/fscrypt_private.h | 4 --- fs/crypto/keyinfo.c | 52 +++++++------------------------------ 4 files changed, 11 insertions(+), 57 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 02a7a9286449..6d6eca394d4d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -327,7 +327,6 @@ EXPORT_SYMBOL(fscrypt_decrypt_page); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -339,18 +338,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); 
cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 932881f27f2f..15bf9c31a34d 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = fscrypt_get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ea01e5279675..ab0440274630 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -67,7 +67,6 @@ struct fscrypt_info { u8 ci_filename_mode; u8 ci_flags; struct crypto_skcipher *ci_ctfm; - struct key *ci_keyring_key; u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; @@ -111,7 +110,4 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); -/* keyinfo.c */ -extern int fscrypt_get_crypt_info(struct inode *); - #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 02eb6b9e4438..cb3e82abf034 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -95,6 +95,7 @@ static int validate_user_key(struct fscrypt_info *crypt_info, kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -102,11 +103,9 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; @@ -117,17 +116,11 @@ static int 
validate_user_key(struct fscrypt_info *crypt_info, "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } @@ -169,12 +162,11 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int fscrypt_get_crypt_info(struct inode *inode) +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -184,21 +176,15 @@ int fscrypt_get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; + if (inode->i_crypt_info) + return 0; + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; if (!inode->i_sb->s_cop->get_context) return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { @@ -229,7 +215,6 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -273,14 +258,8 @@ retry: if (res) goto out; - kzfree(raw_key); - raw_key = NULL; - if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { - put_crypt_info(crypt_info); - goto retry; - } - return 0; - + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == 
-ENOKEY) res = 0; @@ -288,6 +267,7 @@ out: kzfree(raw_key); return res; } +EXPORT_SYMBOL(fscrypt_get_encryption_info); void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) { @@ -305,17 +285,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) put_crypt_info(ci); } EXPORT_SYMBOL(fscrypt_put_encryption_info); - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *ci = inode->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return fscrypt_get_crypt_info(inode); - return 0; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); From 0addb61dc70fff224c344a30bf38345d96ce7fdb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 22 Feb 2017 13:25:14 -0800 Subject: [PATCH 297/804] fscrypt: eliminate ->prepare_context() operation The only use of the ->prepare_context() fscrypt operation was to allow ext4 to evict inline data from the inode before ->set_context(). However, there is no reason why this cannot be done as simply the first step in ->set_context(), and in fact it makes more sense to do it that way because then the policy modes and flags get validated before any real work is done. Therefore, merge ext4_prepare_context() into ext4_set_context(), and remove ->prepare_context(). 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Conflicts: fs/ext4/super.c --- fs/crypto/policy.c | 7 ------- include/linux/fscrypt_common.h | 1 - 2 files changed, 8 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 14b76da71269..4908906d54d5 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -33,17 +33,10 @@ static int create_encryption_context_from_policy(struct inode *inode, const struct fscrypt_policy *policy) { struct fscrypt_context ctx; - int res; if (!inode->i_sb->s_cop->set_context) return -EOPNOTSUPP; - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 547f81592ba1..10c1abfbac6c 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -87,7 +87,6 @@ struct fscrypt_operations { unsigned int flags; const char *key_prefix; int (*get_context)(struct inode *, void *, size_t); - int (*prepare_context)(struct inode *); int (*set_context)(struct inode *, const void *, size_t, void *); int (*dummy_context)(struct inode *); bool (*is_encrypted)(struct inode *); From 31469fc2488f66e7b43f80088690dd386e0d12b6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Apr 2017 14:39:41 -0700 Subject: [PATCH 298/804] fscrypt: remove unnecessary checks for NULL operations The functions in fs/crypto/*.c are only called by filesystems configured with encryption support. Since the ->get_context(), ->set_context(), and ->empty_dir() operations are always provided in that case (and must be, otherwise there would be no way to get/set encryption policies, or in the case of ->get_context() even access encrypted files at all), there is no need to check for these operations being NULL and we can remove these unneeded checks. 
Signed-off-by: Eric Biggers Reviewed-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 3 --- fs/crypto/policy.c | 11 +---------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index cb3e82abf034..4636c18c2fb9 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -183,9 +183,6 @@ int fscrypt_get_encryption_info(struct inode *inode) if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4908906d54d5..d71ec3780d0c 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -34,9 +34,6 @@ static int create_encryption_context_from_policy(struct inode *inode, { struct fscrypt_context ctx; - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); @@ -87,8 +84,6 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else @@ -118,8 +113,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -202,9 +196,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; From 
c24873a651517a8a247a2f90f4ca8631747e793e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Apr 2017 14:43:34 -0700 Subject: [PATCH 299/804] fscrypt: remove fscrypt_symlink_data_len() fscrypt_symlink_data_len() is never called and can be removed. Signed-off-by: Eric Biggers Reviewed-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_common.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 10c1abfbac6c..0a30c106c1e5 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -46,17 +46,6 @@ struct fscrypt_symlink_data { char encrypted_path[1]; } __packed; -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 fscrypt_symlink_data_len(u32 l) -{ - if (l < FS_CRYPTO_BLOCK_SIZE) - l = FS_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct fscrypt_symlink_data) - 1); -} - struct fscrypt_str { unsigned char *name; u32 len; From 171695f2abca44894f48e90401b0ebb46035893f Mon Sep 17 00:00:00 2001 From: Joe Richey Date: Thu, 6 Apr 2017 16:14:05 -0700 Subject: [PATCH 300/804] fscrypt: Move key structure and constants to uapi This commit exposes the necessary constants and structures for a userspace program to pass filesystem encryption keys into the keyring. The fscrypt_key structure was already part of the kernel ABI, this change just makes it so programs no longer have to redeclare these structures (like e4crypt in e2fsprogs currently does). Note that we do not expose the other FS_*_KEY_SIZE constants as they are not necessary. Only XTS is supported for contents_encryption_mode, so currently FS_MAX_KEY_SIZE bytes of key material must always be passed to the kernel. This commit also removes __packed from fscrypt_key as it does not contain any implicit padding and does not refer to an on-disk structure. 
Signed-off-by: Joe Richey Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 11 ----------- include/uapi/linux/fs.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ab0440274630..6ed4ad422fc3 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -22,10 +22,6 @@ #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 #define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 #define FS_KEY_DERIVATION_NONCE_SIZE 16 @@ -51,13 +47,6 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index ea33e08d9d75..a1533084395c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -203,6 +203,19 @@ struct fscrypt_policy { #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ From 48c7f9c819ac97658839140d25e7505b397c6ffe Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 7 Apr 2017 10:58:37 -0700 Subject: [PATCH 301/804] fscrypt: fix context consistency check when key(s) unavailable To mitigate some types of offline attacks, filesystem encryption is designed 
to enforce that all files in an encrypted directory tree use the same encryption policy (i.e. the same encryption context excluding the nonce). However, the fscrypt_has_permitted_context() function which enforces this relies on comparing struct fscrypt_info's, which are only available when we have the encryption keys. This can cause two incorrect behaviors: 1. If we have the parent directory's key but not the child's key, or vice versa, then fscrypt_has_permitted_context() returned false, causing applications to see EPERM or ENOKEY. This is incorrect if the encryption contexts are in fact consistent. Although we'd normally have either both keys or neither key in that case since the master_key_descriptors would be the same, this is not guaranteed because keys can be added or removed from keyrings at any time. 2. If we have neither the parent's key nor the child's key, then fscrypt_has_permitted_context() returned true, causing applications to see no error (or else an error for some other reason). This is incorrect if the encryption contexts are in fact inconsistent, since in that case we should deny access. To fix this, retrieve and compare the fscrypt_contexts if we are unable to set up both fscrypt_infos. While this slightly hurts performance when accessing an encrypted directory tree without the key, this isn't a case we really need to be optimizing for; access *with* the key is much more important. Furthermore, the performance hit is barely noticeable given that we are already retrieving the fscrypt_context and doing two keyring searches in fscrypt_get_encryption_info(). If we ever actually wanted to optimize this case we might start by caching the fscrypt_contexts. 
Cc: stable@vger.kernel.org # 4.0+ Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/policy.c | 87 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index d71ec3780d0c..210976e7a269 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -137,27 +137,61 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. 
+ */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". 
+ */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -166,17 +200,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); From 8c66df6c7a8b22e94c37a6d8374a5ee2ad1dd27f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Apr 2017 10:00:10 -0700 Subject: [PATCH 302/804] fscrypt: introduce helper function for filename matching Introduce a helper function fscrypt_match_name() which tests whether a fscrypt_name matches a directory entry. Also clean up the magic numbers and document things properly. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 90 +++++++++++++++++++++++++-------- fs/crypto/fscrypt_private.h | 2 - include/linux/fscrypt_notsupp.h | 9 ++++ include/linux/fscrypt_supp.h | 78 ++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 22 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 15bf9c31a34d..d1bb02b1ee58 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. 
+ * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. 
+ * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = 
digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 6ed4ad422fc3..0b65491de28a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -13,8 +13,6 @@ #include -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - /* Encryption parameters */ #define FS_XTS_TWEAK_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 3511ca798804..ec406aed2f2f 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -147,6 +147,15 @@ static inline int fscrypt_fname_usr_to_disk(struct inode *inode, return -EOPNOTSUPP; } +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index a140f47e9b27..e12c224a0d1e 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -57,6 +57,84 @@ extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, struct fscrypt_str *); +#define 
FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define FSCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \ + FS_CRYPTO_BLOCK_SIZE)) + +#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE + +/** + * fscrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes collisions very unlikely: + * just a 1 in 2^128 chance for two filenames to collide even if they share the + * same filesystem-specific hashes. + * + * This scheme isn't strictly immune to intentional collisions because it's + * basically like a CBC-MAC, which isn't secure on variable-length inputs. + * However, generating a CBC-MAC collision requires the ability to choose + * arbitrary ciphertext, which won't normally be possible with filename + * encryption since it would require write access to the raw disk. 
+ * + * Taking a real cryptographic hash like SHA-256 over the full ciphertext would + * be better in theory but would be less efficient and more complicated to + * implement, especially since the filesystem would need to calculate it for + * each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. 
+ */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); From a1425ed23988ee78ab0bde734117b382da3bba39 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Apr 2017 10:00:12 -0700 Subject: [PATCH 303/804] f2fs: switch to using fscrypt_match_name() Switch f2fs directory searches to use the fscrypt_match_name() helper function. There should be no functional change. 
Signed-off-by: Eric Biggers Acked-by: Jaegeuk Kim Signed-off-by: Theodore Ts'o --- fs/f2fs/dir.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c716ab0baf1d..a87a5ecca74d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,29 +128,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - if (de->hash_code != namehash) - goto not_match; - - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (unlikely(!name->name)) { - if (fname->usr_fname->name[0] == '_') { - if (de_name.len > 32 && - !memcmp(de_name.name + ((de_name.len - 17) & ~15), - fname->crypto_buf.name + 8, 16)) - goto found; - goto not_match; - } - name->name = fname->crypto_buf.name; - name->len = fname->crypto_buf.len; - } -#endif - if (de_name.len == name->len && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; -not_match: + if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; From a156aa8444353737f3e23aa7b1646852b9f0dea2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 1 May 2017 11:43:32 -0700 Subject: [PATCH 304/804] fscrypt: correct collision claim for digested names As I noted on the mailing list, it's easier than I originally thought to create intentional collisions in the digested names. Unfortunately it's not too easy to solve this, so for now just fix the comment to not lie. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_supp.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index e12c224a0d1e..cd4e82c17304 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -81,20 +81,16 @@ extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, * followed by the second-to-last ciphertext block of the filename. Due to the * use of the CBC-CTS encryption mode, the second-to-last ciphertext block * depends on the full plaintext. (Note that ciphertext stealing causes the - * last two blocks to appear "flipped".) This makes collisions very unlikely: - * just a 1 in 2^128 chance for two filenames to collide even if they share the - * same filesystem-specific hashes. + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. * - * This scheme isn't strictly immune to intentional collisions because it's - * basically like a CBC-MAC, which isn't secure on variable-length inputs. - * However, generating a CBC-MAC collision requires the ability to choose - * arbitrary ciphertext, which won't normally be possible with filename - * encryption since it would require write access to the raw disk. - * - * Taking a real cryptographic hash like SHA-256 over the full ciphertext would - * be better in theory but would be less efficient and more complicated to - * implement, especially since the filesystem would need to calculate it for - * each directory entry examined during a search. + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. 
Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. */ struct fscrypt_digested_name { u32 hash; From 6190400da0498c63e01f1984f3386e95c487d2f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Jul 2017 12:24:49 -0700 Subject: [PATCH 305/804] f2fs, block_dump: give WRITE direction to submit_bio The block_dump in submit_bio uses rw, instead of bio->bi_rw. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/segment.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c1e881242d53..ae6c1353529f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -224,7 +224,7 @@ submit_io: trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); - submit_bio(0, bio); + submit_bio(bio_op(bio), bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5fdc995b1f1e..c35e70e72e8b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -809,7 +809,7 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, } if (bio) { - int ret = submit_bio_wait(0, bio); + int ret = submit_bio_wait(op, bio); bio_put(bio); if (ret) return ret; From cc4611491956055e8a414351e6a180677c46ff08 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 15 May 2017 10:45:08 -0700 Subject: [PATCH 306/804] f2fs: sanity check checkpoint segno and blkoff Make sure segno and blkoff read from raw image are valid. 
Cc: stable@vger.kernel.org Signed-off-by: Jin Qian [Jaegeuk Kim: adjust minor coding style] Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index eeda97b54556..fb68af6b04c5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1523,6 +1523,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1544,6 +1546,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 34c2b318e3ce93a2d7a96f7545fc7c279e492b2d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 16 May 2017 13:20:16 -0700 Subject: [PATCH 307/804] f2fs: load inode's flag from disk This patch fixes missing inode flag loaded from disk, reported by Tom. 
[tom@localhost ~]$ sudo mount /dev/loop0 /mnt/ [tom@localhost ~]$ sudo chown tom:tom /mnt/ [tom@localhost ~]$ touch /mnt/testfile [tom@localhost ~]$ sudo chattr +i /mnt/testfile [tom@localhost ~]$ echo test > /mnt/testfile bash: /mnt/testfile: Operation not permitted [tom@localhost ~]$ rm /mnt/testfile rm: cannot remove '/mnt/testfile': Operation not permitted [tom@localhost ~]$ sudo umount /mnt/ [tom@localhost ~]$ sudo mount /dev/loop0 /mnt/ [tom@localhost ~]$ lsattr /mnt/testfile ----i-------------- /mnt/testfile [tom@localhost ~]$ echo test > /mnt/testfile [tom@localhost ~]$ rm /mnt/testfile [tom@localhost ~]$ sudo umount /mnt/ Cc: stable@vger.kernel.org Reported-by: Tom Yan Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + fs/f2fs/inode.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 70be377c2236..aee781394c87 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1498,6 +1498,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, false); inode_unlock(inode); out: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 518f49643092..e53c784ab11e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -44,7 +44,6 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -226,6 +225,7 @@ make_now: ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; From 71a2058ae62a60a00de63e450d5f3d251cff604d Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Thu, 11 May 2017 04:28:00 +0800 Subject: [PATCH 308/804] f2fs: make sure f2fs_gc returns consistent errno By default, f2fs_gc returns -EINVAL in general error cases, 
e.g., no victim was selected. However, the default errno may be overwritten in two cases: gc_more and BG_GC -> FG_GC. We should return a consistent errno in such cases. Signed-off-by: Weichao Guo Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b527ab0eec1d..afa2b2cf9f7e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -955,7 +955,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, { int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; - int ret = -EINVAL; + int ret; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -965,8 +965,10 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); gc_more: - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; @@ -987,6 +989,7 @@ gc_more: gc_type = FG_GC; } + ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ if (gc_type == BG_GC && !background) goto stop; From 74683b0ea0208d4c37594920e0d1b6499dc5ddc3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 10 May 2017 11:23:36 -0700 Subject: [PATCH 309/804] f2fs: use f2fs_submit_page_bio for ra_meta_pages This patch avoids using f2fs_submit_merged_bio for reads, which was the only read case.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d639fd9062d4..81d6de1336d0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -208,12 +208,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } From 4a6ac1475b49371eda81c62150e9a626882f2029 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 10 May 2017 11:28:38 -0700 Subject: [PATCH 310/804] f2fs: remove unnecessary read cases in merged IO flow Merged IO flow doesn't need to care about read IOs. f2fs_submit_merged_bio -> f2fs_submit_merged_write f2fs_submit_merged_bios -> f2fs_submit_merged_writes f2fs_submit_merged_bio_cond -> f2fs_submit_merged_write_cond Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 14 +++++----- fs/f2fs/data.c | 55 ++++++++++++++++--------------------- fs/f2fs/f2fs.h | 12 ++++---- fs/f2fs/gc.c | 6 ++-- fs/f2fs/node.c | 11 ++++---- fs/f2fs/segment.c | 11 ++++---- fs/f2fs/super.c | 5 +--- include/trace/events/f2fs.h | 2 +- 8 files changed, 51 insertions(+), 65 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 81d6de1336d0..b7580cf84f94 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -248,13 +248,13 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, - 0, page->index, META, WRITE); + 
f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -357,7 +357,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -905,7 +905,7 @@ retry: * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -1294,7 +1294,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); /* this is the case of multiple fstrims without any changes */ if (cpc->reason & CP_DISCARD) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ae6c1353529f..5afdd9455d43 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -290,14 +290,12 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw) + enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? 
&sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = &sbi->write_io[btype]; down_write(&io->io_rwsem); @@ -317,25 +315,24 @@ out: up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __f2fs_submit_merged_bio(sbi, NULL, 0, 0, type, rw); + __f2fs_submit_merged_write(sbi, NULL, 0, 0, type); } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw) + enum page_type type) { if (has_merged_page(sbi, inode, ino, idx, type)) - __f2fs_submit_merged_bio(sbi, inode, ino, idx, type, rw); + __f2fs_submit_merged_write(sbi, inode, ino, idx, type); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* @@ -367,16 +364,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -int f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = &sbi->write_io[btype]; struct page *bio_page; int err = 0; - io = is_read ? 
&sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); @@ -387,8 +383,7 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) /* set submitted = 1 as a return value */ fio->submitted = 1; - if (!is_read) - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); down_write(&io->io_rwsem); @@ -401,12 +396,11 @@ alloc_new: if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { err = -EAGAIN; - if (!is_read) - dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - BIO_MAX_PAGES, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } @@ -420,7 +414,7 @@ alloc_new: f2fs_trace_ios(fio, 0); out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); + trace_f2fs_submit_page_write(fio->page, fio); return err; } @@ -1319,7 +1313,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { - f2fs_flush_merged_bios(fio->sbi); + f2fs_flush_merged_writes(fio->sbi); congestion_wait(BLK_RW_ASYNC, HZ/50); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; @@ -1511,8 +1505,7 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, - DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; @@ -1523,7 +1516,7 @@ out: f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); submitted = NULL; } @@ -1682,8 +1675,8 @@ continue_unlock: mapping->writeback_index = done_index; if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - 0, 
last_idx, DATA, WRITE); + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b0a093e38104..3fec9d6d3962 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -946,7 +946,6 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ @@ -2392,14 +2391,13 @@ void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, int rw); -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi); + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); -int f2fs_submit_page_mbio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index afa2b2cf9f7e..deb20100d0be 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -670,7 +670,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -936,8 +936,8 @@ next: } if 
(gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); blk_finish_plug(&plug); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 833f5fb9858c..90715dade918 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1373,15 +1373,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, 0, - page->index, NODE, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); submitted = NULL; } unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) { - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); submitted = NULL; } if (submitted) @@ -1518,8 +1518,7 @@ continue_unlock: } out: if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(sbi, NULL, ino, last_idx, - NODE, WRITE); + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? 
-EIO: 0; } @@ -1625,7 +1624,7 @@ continue_unlock: } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c35e70e72e8b..5331cbefd681 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -328,8 +328,7 @@ static int __commit_inmem_pages(struct inode *inode, } if (last_idx != ULONG_MAX) - f2fs_submit_merged_bio_cond(sbi, inode, 0, last_idx, - DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -2229,7 +2228,7 @@ reallocate: &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - err = f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_write(fio); if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; @@ -2256,7 +2255,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2375,8 +2374,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, page->mapping->host, - 0, page->index, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb68af6b04c5..528b5198a5e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -817,7 +817,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -1972,9 +1972,6 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); 
spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; for (i = 0; i < NR_PAGE_TYPE; i++) { init_rwsem(&sbi->write_io[i].io_rwsem); sbi->write_io[i].sbi = sbi; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 0d02af995547..fa0d8b07a1bf 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -781,7 +781,7 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, TP_CONDITION(page->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, +DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, TP_PROTO(struct page *page, struct f2fs_io_info *fio), From e61d6504368df2834b9c2b76debe8b1b557d08e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 10 May 2017 14:19:54 -0700 Subject: [PATCH 311/804] f2fs: use fio instead of multiple parameters This patch just switches to passing fio instead of multiple parameters. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5331cbefd681..ca5f815c6eab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2118,61 +2118,62 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if
(IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; if (is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + return __get_segment_type_2(fio); case 4: - return __get_segment_type_4(page, p_type); + return __get_segment_type_4(fio); } + /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + f2fs_bug_on(fio->sbi, fio->sbi->active_logs != NR_CURSEG_TYPE); + + return __get_segment_type_6(fio); } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, @@ -2218,7 +2219,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio->page, fio->type); + int type = __get_segment_type(fio); int err; if (fio->type == NODE || fio->type == DATA) From a3b6a409692bee072eec659b9d18766d53f96c36 Mon Sep 17 00:00:00 2001 From: Jaegeuk 
Kim Date: Wed, 10 May 2017 11:18:25 -0700 Subject: [PATCH 312/804] f2fs: split bio cache Split DATA/NODE type bio cache according to different temperature, so write IOs with the same temperature can be merged in corresponding bio cache as much as possible, otherwise, different temperature write IOs submitting into one bio cache will always cause split of bio. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: include/trace/events/f2fs.h --- fs/f2fs/data.c | 57 +++++++++++++++++++++++++------------ fs/f2fs/f2fs.h | 10 ++++++- fs/f2fs/gc.c | 2 ++ fs/f2fs/segment.c | 24 ++++++++++++---- fs/f2fs/segment.h | 4 +++ fs/f2fs/super.c | 21 ++++++++++++-- include/trace/events/f2fs.h | 11 ++++++- 7 files changed, 100 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5afdd9455d43..f7e597a1d984 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -281,27 +281,32 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, ino, idx); - up_read(&io->io_rwsem); + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. 
*/ + if (ret || btype == META) + break; + } return ret; } static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type) + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, ino, idx)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -311,21 +316,38 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) +{ + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. 
*/ + if (type >= META) + break; + } +} + void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __f2fs_submit_merged_write(sbi, NULL, 0, 0, type); + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, enum page_type type) { - if (has_merged_page(sbi, inode, ino, idx, type)) - __f2fs_submit_merged_write(sbi, inode, ino, idx, type); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@ -368,7 +390,7 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; int err = 0; @@ -404,8 +426,7 @@ alloc_new: io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3fec9d6d3962..b0a34ae19a3f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -859,9 +859,17 @@ enum page_type { OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ @@ -946,7 +954,7 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex 
wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index deb20100d0be..50c7864eb0d9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -586,6 +586,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, @@ -712,6 +713,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ca5f815c6eab..f6cbacf66ddc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2163,17 +2163,29 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) static int __get_segment_type(struct f2fs_io_info *fio) { + int type = 0; + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(fio); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(fio); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(fio->sbi, fio->sbi->active_logs != NR_CURSEG_TYPE); - - return __get_segment_type_6(fio); + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 10bf05d4cff4..e9ba1f1d9723 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -27,6 +27,10 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || 
(t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) + #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 528b5198a5e2..1cb9ca9cab33 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -768,6 +768,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); @@ -839,6 +840,8 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->write_io_dummy) mempool_destroy(sbi->write_io_dummy); destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -1973,9 +1976,19 @@ try_onemore: spin_lock_init(&sbi->stat_lock); for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 
1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) + goto free_options; + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + } } init_rwsem(&sbi->cp_rwsem); @@ -2221,6 +2234,8 @@ free_meta_inode: free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); kfree(options); free_sb_buf: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index fa0d8b07a1bf..20c4556ab56d 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -83,6 +83,12 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { REQ_META | REQ_PRIO, "(MP)" }, \ { 0, " \b" }) +#define show_block_temp(temp) \ + __print_symbolic(temp, \ + { HOT, "HOT" }, \ + { WARM, "WARM" }, \ + { COLD, "COLD" }) + #define show_data_type(type) \ __print_symbolic(type, \ { CURSEG_HOT_DATA, "Hot DATA" }, \ @@ -748,6 +754,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(block_t, new_blkaddr) __field(int, op) __field(int, op_flags) + __field(int, temp) __field(int, type) ), @@ -759,16 +766,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; + __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->op, __entry->op_flags), + show_block_temp(__entry->temp), show_block_type(__entry->type)) ); From 5d6951b8e115161e940f46690a2c971833769584 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 May 2017 
13:51:34 -0700 Subject: [PATCH 313/804] f2fs: avoid f2fs_lock_op for IPU writes Currently, if we do get_node_of_data before f2fs_lock_op, there may be dead lock as follows, where process A would be in infinite loop, and B will NOT be awaked. Process A(cp): Process B: f2fs_lock_all(sbi) get_dnode_of_data <---- lock dn.node_page flush_nodes f2fs_lock_op So, this patch adds f2fs_trylock_op to avoid f2fs_lock_op done by IPU. Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 44 +++++++++++++++++++++++++++++++------------- fs/f2fs/f2fs.h | 13 ++++++++++++- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 2 +- 4 files changed, 45 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f7e597a1d984..8211bab93e06 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1381,12 +1381,12 @@ int do_write_data_page(struct f2fs_io_info *fio) if (valid_ipu_blkaddr(fio)) { ipu_force = true; - fio->need_lock = false; + fio->need_lock = LOCK_DONE; goto got_it; } } - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_lock_op(fio->sbi); err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); @@ -1401,19 +1401,18 @@ int do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } got_it: - err = encrypt_one_page(fio); - if (err) - goto out_writepage; - - set_page_writeback(page); - /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); f2fs_put_dnode(&dn); - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = rewrite_data_page(fio); trace_f2fs_do_write_data_page(fio->page, IPU); @@ -1421,6 +1420,20 @@ got_it: return err; } + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + /* LFS mode write path */ write_data_page(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); @@ -1430,7 +1443,7 @@ got_it: out_writepage: f2fs_put_dnode(&dn); out: - if (fio->need_lock) + if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); return err; } @@ -1456,7 +1469,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, - .need_lock = true, + .need_lock = LOCK_RETRY, }; trace_f2fs_writepage(page, DATA); @@ -1492,7 +1505,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - fio.need_lock = false; + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1511,8 +1524,13 @@ write: goto out; } - if (err == -EAGAIN) + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == -EAGAIN) { + fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b0a34ae19a3f..4a33399e277c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -866,6 +866,12 @@ enum temp_type { NR_TEMP_TYPE, }; +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains 
DATA/NODE/META/META_FLUSH */ @@ -877,7 +883,7 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ - bool need_lock; /* indicate we need to lock cp_rwsem */ + int need_lock; /* indicate we need to lock cp_rwsem */ }; #define is_read_io(rw) ((rw) == READ) @@ -1346,6 +1352,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 50c7864eb0d9..93ba82c968c6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -719,7 +719,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, - .need_lock = true, + .need_lock = LOCK_REQ, }; bool is_dirty = PageDirty(page); int err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f6cbacf66ddc..1a280099da8f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -312,7 +312,7 @@ static int __commit_inmem_pages(struct inode *inode, fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; - fio.need_lock = false, + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { unlock_page(page); From a34df1bce787535738fa89ab1968482c6ff28f26 Mon Sep 17 00:00:00 2001 From: Hou Pengyang Date: Wed, 17 May 2017 02:48:48 +0000 Subject: [PATCH 314/804] f2fs: declare load_free_nid_bitmap static Signed-off-by: Hou Pengyang Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 90715dade918..8e27e853ed11 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2555,7 +2555,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) return 0; } -inline void 
load_free_nid_bitmap(struct f2fs_sb_info *sbi) +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int i = 0; From 800a48334fcd12b868392f031b6ae7547e469eec Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 17 May 2017 17:22:51 +0800 Subject: [PATCH 315/804] f2fs: add a new function get_ssr_cost This patch adds a new function, get_ssr_cost, to select the SSR segment more accurately. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 93ba82c968c6..3c901bc2f917 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -258,11 +258,20 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } +static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + + return se->ckpt_valid_blocks > se->valid_blocks ? + se->ckpt_valid_blocks : se->valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + return get_ssr_cost(sbi, segno); /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) From 842ce444fd86167f3e9dc858f22e4c90639764e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 17 May 2017 10:36:58 -0700 Subject: [PATCH 316/804] f2fs: try to freeze in gc and discard threads This allows the gc and discard threads to be frozen.
Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 9 +++++---- fs/f2fs/segment.c | 25 ++++++++++++++++--------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3c901bc2f917..1e6716ee64c1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -32,13 +32,14 @@ static int gc_thread_func(void *data) wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current), + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1a280099da8f..46ee1139046c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -1138,18 +1139,24 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; -repeat: - if (kthread_should_stop()) - return 0; - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + set_freezable(); - congestion_wait(BLK_RW_SYNC, HZ/50); + do { + wait_event_interruptible(*q, kthread_should_stop() || + freezing(current) || + atomic_read(&dcc->discard_cmd_cnt)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; - wait_event_interruptible(*q, kthread_should_stop() || - atomic_read(&dcc->discard_cmd_cnt)); - goto repeat; + __issue_discard_cmd(sbi, true); + __wait_discard_cmd(sbi, true); + + congestion_wait(BLK_RW_SYNC, HZ/50); + } while (!kthread_should_stop()); + return 0; } #ifdef CONFIG_BLK_DEV_ZONED From e2b2bed0961e34d6fd1c293af66bd39b10a317f4 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 19 May 2017 15:06:12 +0800 Subject: [PATCH 317/804] f2fs: fix a bug caused by NULL 
extent tree Thread A: Thread B: -f2fs_remount -sbi->mount_opt.opt = 0; <--- -f2fs_iget -do_read_inode -f2fs_init_extent_tree -F2FS_I(inode)->extent_tree is NULL -default_options && parse_options -remount return <--- -f2fs_map_blocks -f2fs_lookup_extent_tree -f2fs_bug_on(sbi, !et); The same problem exists with f2fs_new_inode. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2f98d7039701..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -320,7 +320,7 @@ static void __drop_largest_extent(struct inode *inode, } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -358,6 +358,16 @@ out: return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { From b89cdaf6b93047d2f916d8c34100b1239665fd20 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 19 May 2017 14:42:12 +0800 Subject: [PATCH 318/804] f2fs: combine huge num of discard rb tree consistence checks Came across a hung task caused by a huge number of rb tree traversals while adding discard addrs in cp. This patch combines these consistency checks and moves them to the discard thread.
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 46ee1139046c..3c24a8ca0283 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -917,7 +917,6 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dc->len = blkaddr - dc->lstart; dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); modified = true; } @@ -927,16 +926,12 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr, NULL, NULL); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } else { dc->lstart++; dc->len--; dc->start++; dcc->undiscard_blks += dc->len; __relocate_discard_cmd(dcc, dc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } } } @@ -997,8 +992,6 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); di = prev_dc->di; tdc = prev_dc; merged = true; @@ -1014,16 +1007,12 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); merged = true; } if (!merged) { __insert_discard_tree(sbi, bdev, di.lstart, di.start, di.len, NULL, NULL); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); } next: prev_dc = next_dc; @@ -1062,6 +1051,8 @@ static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) int i, iter = 0; mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { pend_list = 
&dcc->pend_list[i]; From f152939829d14d6fd6e0f8a461df1996acc8269d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:37:00 +0800 Subject: [PATCH 319/804] f2fs: split wio_mutex Split wio_mutex to adjust different temperature bio cache. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 7 ++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4a33399e277c..dc9de0418621 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -961,7 +961,8 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3c24a8ca0283..00503627c1d1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2233,7 +2233,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int err; if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); + mutex_lock(&fio->sbi->wio_mutex[fio->type][fio->temp]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); @@ -2246,7 +2246,7 @@ reallocate: } if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); + mutex_unlock(&fio->sbi->wio_mutex[fio->type][fio->temp]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1cb9ca9cab33..68d4285f635c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1573,7 +1573,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = 
sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1605,8 +1605,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); } From 843d3364d7996211e38545bcd484d2eebeb1e5a5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:37:01 +0800 Subject: [PATCH 320/804] f2fs: introduce io_list for serialize data/node IOs Serialize data/node IOs by using fifo list instead of mutex lock, it will help to enhance concurrency of f2fs, meanwhile keeping LFS IO semantics. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 24 ++++++++++++++++++++---- fs/f2fs/f2fs.h | 7 ++++++- fs/f2fs/gc.c | 3 ++- fs/f2fs/segment.c | 22 +++++++++++++++------- fs/f2fs/super.c | 2 ++ 6 files changed, 46 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b7580cf84f94..69641cf7fd6f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -163,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? 
(REQ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8211bab93e06..f61ab7539229 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -396,6 +396,20 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } + if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); @@ -407,8 +421,6 @@ int f2fs_submit_page_write(struct f2fs_io_info *fio) inc_page_count(sbi, WB_DATA_TYPE(bio_page)); - down_write(&io->io_rwsem); - if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || !__same_bdev(sbi, fio->new_blkaddr, io->bio))) @@ -433,9 +445,13 @@ alloc_new: io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_write(fio->page, fio); return err; } @@ -748,7 +764,7 @@ alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc9de0418621..f35473293e46 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -882,8 +882,10 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs 
*/ bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ }; #define is_read_io(rw) ((rw) == READ) @@ -893,6 +895,8 @@ struct f2fs_bio_info { sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ }; #define FDEV(i) (sbi->devs[i]) @@ -2361,7 +2365,8 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, bool recover_newaddr); void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type); + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1e6716ee64c1..c72da8733ba6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -600,6 +600,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -643,7 +644,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 00503627c1d1..1be5947ae1fe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2188,7 +2188,8 @@ static int __get_segment_type(struct f2fs_io_info *fio) void 
allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2224,6 +2225,17 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } @@ -2232,11 +2244,9 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int type = __get_segment_type(fio); int err; - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type][fio->temp]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ err = f2fs_submit_page_write(fio); @@ -2244,9 +2254,6 @@ reallocate: fio->old_blkaddr = fio->new_blkaddr; goto reallocate; } - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type][fio->temp]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2260,6 +2267,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68d4285f635c..9c9a01f776dc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1989,6 +1989,8 @@ try_onemore: init_rwsem(&sbi->write_io[i][j].io_rwsem); sbi->write_io[i][j].sbi = sbi; 
sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); } } From 732de6bf9e328f15a85606744ddab0559ab7bf65 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:46:43 +0800 Subject: [PATCH 321/804] f2fs: show more info if fail to issue discard Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1be5947ae1fe..fcdc45c8ba1b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -741,7 +741,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, if (dc->error) f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", dc->error); + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } From aa9d75d2f522fccac641f5cb535acd53b0238f2e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:46:44 +0800 Subject: [PATCH 322/804] f2fs: wake up all waiters in f2fs_submit_discard_endio There could be more than one waiter waiting discard IO completion, so we need use complete_all() instead of complete() in f2fs_submit_discard_endio to avoid hungtask. 
Fixes: ec9895add2c5 ("f2fs: don't hold cmd_lock during waiting discard command") Cc: Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fcdc45c8ba1b..66cbd3da0404 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -752,7 +752,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) dc->error = bio->bi_error; dc->state = D_DONE; - complete(&dc->wait); + complete_all(&dc->wait); bio_put(bio); } From b3df3669b73c7e48e6b808e1b17522f585bd69ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 19 May 2017 23:46:45 +0800 Subject: [PATCH 323/804] f2fs: wait discard IO completion without cmd_lock held Wait discard IO completion outside cmd_lock to avoid long latency of holding cmd_lock in IO busy scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 66cbd3da0404..c8f5d8feac44 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1076,17 +1076,34 @@ static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = &(dcc->wait_list); struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (!wait_cond || dc->state == D_DONE) { - if (dc->ref) - continue; + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; } } mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); 
+ goto next; + } } /* This should be covered by global mutex, &sit_i->sentry_lock */ From f4afd85fc4212df5b75ab8e261214b41f0eb8007 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 May 2017 17:39:43 -0700 Subject: [PATCH 324/804] f2fs: don't bother checking for encryption key in ->mmap() Since only an open file can be mmap'ed, and we only allow open()ing an encrypted file when its key is available, there is no need to check for the key again before permitting each mmap(). This f2fs copy of this code was also broken in that it wouldn't actually have failed if the key was in fact unavailable. Signed-off-by: Eric Biggers Reviewed-by: David Gstir Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index aee781394c87..8f8bd3e49f1f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -417,14 +417,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!f2fs_encrypted_inode(inode)) - return -ENOKEY; - } - /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) From 6dbf9cbf28ab6e5134ad1a3f2fc1ba4c9e195be0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 May 2017 17:39:45 -0700 Subject: [PATCH 325/804] f2fs: don't bother checking for encryption key in ->write_iter() Since only an open file can be written to, and we only allow open()ing an encrypted file when its key is available, there is no need to check for the key again before permitting each ->write_iter(). This code was also broken in that it wouldn't actually have failed if the key was in fact unavailable. 
Signed-off-by: Eric Biggers Reviewed-by: David Gstir Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8f8bd3e49f1f..f87eeb04ea8d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2324,11 +2324,6 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { From 46281c4ef52648cf84e16013bd196a78f9d84d3d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 26 May 2017 17:04:40 +0900 Subject: [PATCH 326/804] f2fs: Do not issue small discards in LFS mode clear_prefree_segments() issues small discards after discarding full segments. These small discards may not be section aligned, so not zone aligned on a zoned block device, causing __f2fs_issue_discard_zone() to fail. Fix this by not issuing small discards for a volume mounted with the BLKZONED feature enabled.
Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c8f5d8feac44..33b1628245e7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1416,7 +1416,8 @@ find_next: sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (force && len < cpc->trim_minlen) + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, From 97c0717bd043d751969e0d7cc40e12b719bc271f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Jun 2017 15:39:27 -0700 Subject: [PATCH 327/804] f2fs: remove false-positive bug_on For example, f2fs_create - new_node_page is failed - handle_failed_inode - skip to add it into orphan list, since ni.blk_addr == NULL_ADDR : set_inode_flag(inode, FI_FREE_NID) f2fs_evict_inode - EIO due to fault injection - f2fs_bug_on() is triggered So, we don't need to call f2fs_bug_on in this case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e53c784ab11e..868d71436ebc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -425,9 +425,10 @@ no_delete: if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); From 0a3354135a308bb16420547ea91c8fbaabbb8dcd Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Thu, 1 Jun 2017 16:50:10 +0800 Subject: [PATCH 328/804] f2fs: remove the unnecessary cast for PTR_ERR It's not necessary to specify 'int' casting for PTR_ERR. 
Signed-off-by: Zhang Shengju Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a45d1f4b7b0f..ad26f59ba464 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -236,7 +236,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } From 83f42def8ae0598dfa66c3419c9df4078df60a48 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 1 Jun 2017 16:43:51 +0800 Subject: [PATCH 329/804] f2fs: fix a panic caused by NULL flush_cmd_control Mount fs with option noflush_merge, boot failed for illegal address fcc in function f2fs_issue_flush: if (!test_opt(sbi, FLUSH_MERGE)) { ret = submit_flush_wait(sbi); atomic_inc(&fcc->issued_flush); -> Here, fcc illegal return ret; } Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 33b1628245e7..3e2121dcaded 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -566,6 +566,9 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -3319,7 +3322,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; From 1479b6ffeb2eba6575600fb941f73f47d74a91a0 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Thu, 1 Jun 2017 11:18:30 -0700 Subject: [PATCH 330/804] f2fs: sanity check size 
of nat and sit cache Make sure number of entries doesn't exceed max journal size. Cc: stable@vger.kernel.org Signed-off-by: Jin Qian Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3e2121dcaded..f86fd003f932 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2574,6 +2574,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -2600,6 +2602,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } From 9ddde7dfd1c9446a26224353393d145c60426093 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Fri, 2 Jun 2017 15:45:42 +0800 Subject: [PATCH 331/804] f2fs: simplify the way of calulating next nat address The index of segment which the next nat block is in has only one different bit than the current one, so to get the next nat address, we can simply alter that one bit. 
Signed-off-by: Fan Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 558048e33cf9..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -224,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } From 11de19dfa757695700c28278ff1f0a289c036a9e Mon Sep 17 00:00:00 2001 From: Qiuyang Sun Date: Thu, 18 May 2017 11:06:45 +0800 Subject: [PATCH 332/804] f2fs: dax: fix races between page faults and truncating pages Currently in F2FS, page faults and operations that truncate the pagecache or data blocks are completely unsynchronized. This can result in a page fault faulting in a page into a range that we are changing after truncating, and thus we can end up with a page mapped to disk blocks that will be shortly freed. Filesystem corruption will shortly follow. This patch fixes the problem by creating a new rw semaphore i_mmap_sem in f2fs_inode_info and grabbing it for functions removing blocks from extent tree and for read over page faults. The mechanism is similar to that in ext4. 
Signed-off-by: Qiuyang Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/file.c --- fs/f2fs/data.c | 2 ++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- fs/f2fs/super.c | 1 + 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f61ab7539229..50048986bed9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1799,8 +1799,10 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f35473293e46..da7bb61a678a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -586,6 +586,7 @@ struct f2fs_inode_info { struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; }; static inline void get_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f87eeb04ea8d..447dd1221167 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -34,6 +34,19 @@ #include "trace.h" #include +static int f2fs_filemap_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -61,13 +74,14 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || 
page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -96,6 +110,8 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -103,7 +119,7 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -681,8 +697,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return -EACCES; if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); err = f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; } else { @@ -690,7 +708,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. 
*/ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -836,12 +856,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -1080,16 +1102,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1102,6 +1125,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1166,9 +1191,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - return ret; + goto out_sem; truncate_pagecache_range(inode, offset, offset + len - 1); @@ -1182,7 +1208,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, offset + len); } 
else { @@ -1190,7 +1216,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1239,6 +1265,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, out: if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); +out_sem: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1268,14 +1296,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) - return ret; + goto out; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); @@ -1304,6 +1333,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9c9a01f776dc..dc69af8ed028 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -624,6 +624,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; From 4798bcc8085173a5f747ba9a597bc8fc27cbd8d6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Jun 2017 18:29:06 +0800 Subject: [PATCH 333/804] f2fs: introduce __wait_one_discard_bio In order to avoid copied codes. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f86fd003f932..e10d4e5f2193 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1074,6 +1074,20 @@ out: mutex_unlock(&dcc->cmd_lock); } +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1098,13 +1112,7 @@ next: mutex_unlock(&dcc->cmd_lock); if (need_wait) { - wait_for_completion_io(&dc->wait); - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, dc->state != D_DONE); - dc->ref--; - if (!dc->ref) - __remove_discard_cmd(sbi, dc); - mutex_unlock(&dcc->cmd_lock); + __wait_one_discard_bio(sbi, dc); goto next; } } @@ -1128,15 +1136,8 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { - wait_for_completion_io(&dc->wait); - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, dc->state != D_DONE); - dc->ref--; - if (!dc->ref) - __remove_discard_cmd(sbi, dc); - mutex_unlock(&dcc->cmd_lock); - } + if (need_wait) + __wait_one_discard_bio(sbi, dc); } /* This comes from f2fs_put_super */ From f3fb4448d8f0843b6aec068fdcb0c8ea8a5321da Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Jun 2017 18:29:07 +0800 Subject: [PATCH 334/804] f2fs: add f2fs_bug_on in __remove_discard_cmd Recently, discard related codes have changed a lot, so add f2fs_bug_on to detect potential bug. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e10d4e5f2193..9b08a6660d13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -739,6 +739,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + f2fs_bug_on(sbi, dc->ref); + if (dc->error == -EOPNOTSUPP) dc->error = 0; From 958022601a8cd3bd95aa2673f9de42ce98c9c4fa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Jun 2017 18:29:08 +0800 Subject: [PATCH 335/804] f2fs: don't track newly allocated nat entry in list We will never persist newly allocated nat entries during checkpoint(), so we don't need to track such nat entries in nat dirty list in order to avoid: - more latency during traversing dirty list; - sorting nat sets incorrectly due to recording wrong entry_cnt in nat entry set. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8e27e853ed11..c1b940ea01db 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -158,9 +158,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -171,10 +168,18 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, 
&head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -2426,8 +2431,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, From 82aed6f9d75f503b3dfabc7be02a25b168348511 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 7 Jun 2017 11:17:35 +0800 Subject: [PATCH 336/804] f2fs: fix to avoid panic when encountering corrupt node With fault_injection option, generic/361 of fstests will complain us with below message: Call Trace: get_node_page+0x12/0x20 [f2fs] f2fs_iget+0x92/0x7d0 [f2fs] f2fs_fill_super+0x10fb/0x15e0 [f2fs] mount_bdev+0x184/0x1c0 f2fs_mount+0x15/0x20 [f2fs] mount_fs+0x39/0x150 vfs_kern_mount+0x67/0x110 do_mount+0x1bb/0xc70 SyS_mount+0x83/0xd0 do_syscall_64+0x6e/0x160 entry_SYSCALL64_slow_path+0x25/0x25 Since mkfs on a loop device in an f2fs partition can fail silently due to checkpoint error injection, the root inode page can be corrupted. In order to avoid a needless panic in get_node_page, it's better to leave a message and return an error to the caller, and let fsck repair it later. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c1b940ea01db..70f3c01a806f 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1157,6 +1157,7 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1170,15 +1171,22 @@ repeat: goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); ClearPageUptodate(page); + err = -EINVAL; out_err: f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } From e1640316b9d82b2f12ca0165a082e3f357b26d9f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 9 Jun 2017 06:32:54 +0800 Subject: [PATCH 337/804] f2fs: use proper variable name It is better to use variable name "inline_dentry" instead of "dentry_blk" when data type is "struct f2fs_inline_dentry". This patch has no functional changes, just to make code more readable especially when call the function make_dentry_ptr_inline() and f2fs_convert_inline_dir(). 
Signed-off-by: Tiezhu Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fc8b49696b9d..03c86e55e4a7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -316,12 +316,12 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; + struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, dentry_blk); + make_dentry_ptr_inline(NULL, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -500,7 +500,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + struct f2fs_inline_dentry *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,11 +510,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + inline_dentry = inline_data_addr(ipage); + bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, slots, NR_INLINE_DENTRY); if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -534,7 +534,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, dentry_blk); + 
make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -586,14 +586,14 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + struct f2fs_inline_dentry *inline_dentry; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + inline_dentry = inline_data_addr(ipage); + bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, NR_INLINE_DENTRY, bit_pos); From fb359654223525f89041c14de65acf1ef081607d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Jun 2017 09:21:11 +0200 Subject: [PATCH 338/804] f2fs: Fix a return value in case of error in 'f2fs_fill_super' err must be set to -ENOMEM, otherwise we return 0. Fixes: a912b54d3aaa0 ("f2fs: split bio cache") Signed-off-by: Christophe JAILLET Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dc69af8ed028..fb51fd248d5b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1983,8 +1983,10 @@ try_onemore: sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), GFP_KERNEL); - if (!sbi->write_io[i]) + if (!sbi->write_io[i]) { + err = -ENOMEM; goto free_options; + } for (j = HOT; j < n; j++) { init_rwsem(&sbi->write_io[i][j].io_rwsem); From 47871a830a29b6327cda1e5e2866c39c9f3650f4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jun 2017 09:44:24 +0800 Subject: [PATCH 339/804] f2fs: fix to show injection rate in ->show_options If fault injection functionality is enabled, show additional injection rate in ->show_options. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb51fd248d5b..26addfca7baa 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -985,7 +985,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) - seq_puts(seq, ",fault_injection"); + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); #endif return 0; From b9ee759fbcae5bccc24f0520b6e1d5db0b5cc916 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Jun 2017 09:44:27 +0800 Subject: [PATCH 340/804] f2fs: fix wrong error number of fill_super This patch fixes incorrect error number in error path of fill_super. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 26addfca7baa..d98fdb211cdf 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1936,6 +1936,7 @@ try_onemore: if (f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; goto free_sb_buf; } #endif @@ -2009,8 +2010,10 @@ try_onemore: if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); - if (!sbi->write_io_dummy) + if (!sbi->write_io_dummy) { + err = -ENOMEM; goto free_options; + } } /* get an inode for meta space */ From 1fb6bf71cd86f0f8433ff513f795f8248d18f59d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 17:39:46 +0800 Subject: [PATCH 341/804] f2fs: clean up sysfs codes Just cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 121 +++++++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 47 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d98fdb211cdf..70e82c4c210d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -349,6 +349,22 @@ static struct kobj_type f2fs_ktype = { .release = f2fs_sb_release, }; +int __init f2fs_register_sysfs(void) +{ + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) + return -ENOMEM; + return 0; +} + +void f2fs_unregister_sysfs(void) +{ + kset_unregister(f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); +} + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -766,17 +782,23 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } -static void f2fs_put_super(struct super_block *sb) +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); - int i; + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } - kobject_del(&sbi->s_kobj); +} + +static void f2fs_put_super(struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; stop_gc_thread(sbi); @@ -829,8 +851,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_exit_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -1060,6 +1082,37 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); +int 
f2fs_init_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } + + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto err_out; + return 0; +err_out: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + return err; +} + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -2120,22 +2173,9 @@ try_onemore: goto free_root_inode; } - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + err = f2fs_init_sysfs(sbi); if (err) - goto free_proc; + goto free_root_inode; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2146,7 +2186,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_sysfs; } if (need_fsck) @@ -2160,7 +2200,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_kobj; + goto free_sysfs; } } else { err = recover_fsync_data(sbi, true); @@ -2169,7 +2209,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to 
recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -2184,7 +2224,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_sysfs; } kfree(options); @@ -2202,17 +2242,9 @@ skip_recovery: f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_sysfs: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } + f2fs_exit_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2327,30 +2359,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_register_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_unregister_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2367,11 +2395,10 @@ fail: static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_unregister_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); From 
06fc88a60d56abbde2f0cd034358fef37f7d307c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 17:39:47 +0800 Subject: [PATCH 342/804] f2fs: move sysfs code from super.c to fs/f2fs/sysfs.c Code related to sysfs and procfs is scattered and mixed with sb-related code, but it is actually independent from the rest, so split it out of super.c, and reorganize and manage it in sysfs.c. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/super.c --- fs/f2fs/Makefile | 2 +- fs/f2fs/f2fs.h | 8 ++ fs/f2fs/super.c | 333 -------------------------------------------- fs/f2fs/sysfs.c | 350 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 359 insertions(+), 334 deletions(-) create mode 100644 fs/f2fs/sysfs.c diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index da7bb61a678a..0d9d25891833 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2740,6 +2740,14 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); +/* + * sysfs.c + */ +int __init f2fs_register_sysfs(void); +void f2fs_unregister_sysfs(void); +int f2fs_init_sysfs(struct f2fs_sb_info *sbi); +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); + /* * crypto support */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 70e82c4c210d..42093c7c9ae0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -35,9 +35,7 @@ #define CREATE_TRACE_POINTS #include -static struct proc_dir_entry *f2fs_proc_root; static 
struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -146,225 +144,6 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - DCC_INFO, /* struct discard_cmd_control */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == DCC_INFO) - return (unsigned char *)SM_I(sbi)->dcc_info; - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); -} - -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - 
- ui = (unsigned int *)(ptr + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) - return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->store ? 
a->store(a, sbi, buf, len) : 0; -} - -static void f2fs_sb_release(struct kobject *kobj) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ -} - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, 
cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(min_hot_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), -#endif - ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - -int __init f2fs_register_sysfs(void) -{ - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); - - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) - return -ENOMEM; - return 0; -} - -void f2fs_unregister_sysfs(void) -{ - kset_unregister(f2fs_kset); - remove_proc_entry("fs/f2fs", NULL); -} - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
{ struct va_format vaf; @@ -782,19 +561,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) -{ - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); - } -} - static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1014,105 +780,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, false)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, false)); - for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", 
se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .owner = THIS_MODULE, \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - -int f2fs_init_sysfs(struct f2fs_sb_info *sbi) -{ - struct super_block *sb = sbi->sb; - int err; - - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); - if (err) - goto err_out; - return 0; -err_out: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - return err; -} - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..714a3e47bbe8 --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,350 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; +static struct kset *f2fs_kset; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned 
int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + *ui = t; + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, 
cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + 
seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + +int __init f2fs_register_sysfs(void) +{ + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) + return -ENOMEM; + return 0; +} + +void f2fs_unregister_sysfs(void) +{ + kset_unregister(f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); +} + +int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } + + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = 
kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); + if (err) + goto err_out; + return 0; +err_out: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + return err; +} + +void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +{ + kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } +} From 4e968ec7cb9d76cc71c46370696784818a0dd364 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 13 Jun 2017 16:47:54 -0700 Subject: [PATCH 343/804] f2fs: require key for truncate(2) of encrypted file Currently, filesystems allow truncate(2) on an encrypted file without the encryption key. However, it's impossible to correctly handle the case where the size being truncated to is not a multiple of the filesystem block size, because that would require decrypting the final block, zeroing the part beyond i_size, then encrypting the block. As other modifications to encrypted file contents are prohibited without the key, just prohibit truncate(2) as well, making it fail with ENOKEY. 
Signed-off-by: Eric Biggers Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 447dd1221167..e93dcb9da1c0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -692,9 +692,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } if (attr->ia_size <= i_size_read(inode)) { down_write(&F2FS_I(inode)->i_mmap_sem); From 8a4c67330110ee0623b7215961ced82dc5e6b5cc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 23:00:55 +0800 Subject: [PATCH 344/804] f2fs: set CP_TRIMMED_FLAG correctly Don't set CP_TRIMMED_FLAG for a non-zoned block device or a device that does not support discard; this avoids triggering an unneeded checkpoint on that kind of device. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 42093c7c9ae0..2b72e1e9a330 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -587,7 +587,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ f2fs_wait_discard_bios(sbi); - if (!sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; From b3cba4ddf8140b28c9bcc4c22c1ccbe1342cc55a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 14 Jun 2017 23:00:56 +0800 Subject: [PATCH 345/804] f2fs: measure inode.i_blocks as generic filesystem Both in memory and on disk, generic filesystems record i_blocks as a count of 512-byte sectors, and VFS submodules such as disk quota follow this rule, but f2fs records it as a count of 4096-byte blocks. Because of this difference, once we use the disk quota functions that inc/dec i_blocks, the i_blocks of f2fs becomes inconsistent between memory and disk. To resolve this issue, this patch makes the in-memory i_blocks of f2fs record the sector count instead of the block count, while leaving the on-disk i_blocks recording the block count. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 23 +++++++++++++---------- fs/f2fs/file.c | 1 - fs/f2fs/inode.c | 5 +++-- fs/f2fs/node.c | 2 +- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d9d25891833..8e0f9693db04 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1419,10 +1419,10 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > + (F2FS_DEFAULT_ALLOCATED_BLOCKS + xattr_block); } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1430,7 +1430,7 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { @@ -1468,11 +1468,13 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); f2fs_i_blocks_write(inode, count, false); @@ -1923,13 +1925,14 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - 
blkcnt_t diff, bool add) + block_t diff, bool add) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + blkcnt_t sectors = diff << F2FS_LOG_SECTORS_PER_BLOCK; - inode->i_blocks = add ? inode->i_blocks + diff : - inode->i_blocks - diff; + inode->i_blocks = add ? inode->i_blocks + sectors : + inode->i_blocks - sectors; f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e93dcb9da1c0..b9a33c910b8a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -647,7 +647,6 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 868d71436ebc..1ff5bd418d87 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include @@ -129,7 +130,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks)); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -267,7 +268,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks)); if (et) { read_lock(&et->lock); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 70f3c01a806f..b36b34f45bae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1011,7 +1011,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has 
failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ truncate_node(&dn); From 17ca8933b3827480fe7e94c228db05d3a471d180 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Jun 2017 08:05:32 -0700 Subject: [PATCH 346/804] f2fs: don't need to check encrypted inode for partial truncation The cache_only is always false, if inode is encrypted. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b9a33c910b8a..cb99a7701080 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -537,8 +537,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; From 40edf0c8ad01ca3c495bf2173ae5e263e9dbd318 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 15 Jun 2017 16:44:42 -0700 Subject: [PATCH 347/804] f2fs: add ioctl to do gc with target block address This patch adds f2fs_ioc_gc_range() to move blocks located in the given range. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8e0f9693db04..c0985a462af9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -370,6 +370,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_move_range) #define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -394,6 +396,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cb99a7701080..9978e86c89c5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1896,6 +1896,50 @@ out: return ret; } +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += sbi->blocks_per_seg; + if 
(range.start <= end) + goto do_more; +out: + mnt_drop_write_file(filp); + return ret; +} + static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2340,6 +2384,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: @@ -2413,6 +2459,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: From 38435dd20c4cd9e08dcdb9a8f79529cf13bf4936 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Jun 2017 20:55:55 -0700 Subject: [PATCH 348/804] f2fs: report # of free inodes more precisely If the partition is small, we don't need to report total # of inodes including hidden free nodes. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2b72e1e9a330..8948d7b2cb28 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -681,6 +681,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -693,9 +694,16 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = min(buf->f_files - valid_node_count(sbi), - buf->f_bavail); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From 99ad6f555ab7f7ca7c337521dfe0c66a703dbdff Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Sat, 24 Jun 2017 15:57:19 +0800 Subject: [PATCH 349/804] f2fs: avoid redundant f2fs_flush after remount create_flush_cmd_control will create redundant issue_flush_thread after each remount with flush_merge option. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9b08a6660d13..d73b4b29055c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -555,6 +555,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; goto init_thread; } From ea9b9467fabf37b393eca00668cf9e2f8fce1ee8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 26 Jun 2017 16:24:41 +0800 Subject: [PATCH 350/804] f2fs: introduce reserved_blocks in sysfs In this patch, we add a new sysfs interface with which we can control the number of reserved blocks in the system that cannot be used by the user. It enables f2fs to let the user adjust the over-provision ratio dynamically instead of changing it via mkfs. So we can expect it will help to reserve more free space for relieving GC in both the filesystem and the flash device. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: Documentation/ABI/testing/sysfs-fs-f2fs --- Documentation/ABI/testing/sysfs-fs-f2fs | 32 +++++++++++++++++++++++++ fs/f2fs/f2fs.h | 13 ++++++---- fs/f2fs/super.c | 4 +++- fs/f2fs/sysfs.c | 16 ++++++++++++- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c727..2805ce062fdb 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -92,3 +92,35 @@ Date: October 2015 Contact: "Chao Yu" Description: Controls the count of nid pages to be readaheaded. + +What: /sys/fs/f2fs/<disk>/dirty_nats_ratio +Date: January 2016 +Contact: "Chao Yu" +Description: + Controls dirty nat entries ratio threshold, if current + ratio exceeds configured threshold, checkpoint will + be triggered for flushing dirty nat entries.
+ +What: /sys/fs/f2fs/<disk>/lifetime_write_kbytes +Date: January 2016 +Contact: "Shuoran Liu" +Description: + Shows total written kbytes issued to disk. + +What: /sys/fs/f2fs/<disk>/inject_rate +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection rate. + +What: /sys/fs/f2fs/<disk>/inject_type +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection type. + +What: /sys/fs/f2fs/<disk>/reserved_blocks +Date: June 2017 +Contact: "Chao Yu" +Description: + Controls current reserved blocks in system. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c0985a462af9..e908bf456230 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1033,6 +1033,8 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ @@ -1443,6 +1445,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { blkcnt_t diff; + block_t avail_user_block_count; #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { @@ -1458,10 +1461,11 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); @@
-1623,7 +1627,8 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8948d7b2cb28..c83a5ad7c2ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -692,7 +692,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -1773,6 +1774,7 @@ try_onemore: sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 714a3e47bbe8..9adc202fcd6f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -30,6 +30,7 @@ enum { FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif + RESERVED_BLOCKS, }; struct f2fs_attr { @@ -51,7 +52,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)SM_I(sbi)->dcc_info; else if (struct_type == NM_INFO) return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) return (unsigned char *)sbi; #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || @@ -111,6 +112,17 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (a->struct_type == 
FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; #endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } *ui = t; return count; } @@ -165,6 +177,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -208,6 +221,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_blocks), NULL, }; From 243d3acf5b181bbea3a4ad2ec8a3a84c26a22701 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 29 Jun 2017 23:17:45 +0800 Subject: [PATCH 351/804] f2fs: stop gc/discard thread in prior during umount This patch resolves kernel panic for xfstests/081, caused by recent f2fs_bug_on f2fs: add f2fs_bug_on in __remove_discard_cmd For fixing, we will stop gc/discard thread in prior in ->kill_sb in order to avoid referring and releasing race among them. 
Signed-off-by: Jaegeuk Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 19 +++++++++++++------ fs/f2fs/super.c | 7 ++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e908bf456230..33681a2f160d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2360,6 +2360,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d73b4b29055c..09532f823cbc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1144,6 +1144,18 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) __wait_one_discard_bio(sbi, dc); } +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + /* This comes from f2fs_put_super */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { @@ -1501,12 +1513,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return; - if (dcc->f2fs_issue_discard) { - struct task_struct *discard_thread = dcc->f2fs_issue_discard; - - dcc->f2fs_issue_discard = NULL; - kthread_stop(discard_thread); - } + stop_discard_thread(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c83a5ad7c2ba..e4202585f92e 100644 --- a/fs/f2fs/super.c 
+++ b/fs/f2fs/super.c @@ -566,8 +566,6 @@ static void f2fs_put_super(struct super_block *sb) struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; - stop_gc_thread(sbi); - /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -1981,8 +1979,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } From 3099c953ccfdd643e83329f55afa088e94904831 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 30 Jun 2017 17:19:02 +0800 Subject: [PATCH 352/804] f2fs: introduce __check_sit_bitmap After we introduced the discard thread, discard commands can be issued concurrently with data allocation. This patch adds a new function to check the sit bitmap to ensure that the user data covered by an on-going discard command is invalid. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Conflicts: fs/f2fs/segment.c --- fs/f2fs/segment.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 09532f823cbc..0f6cded83c7b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -844,6 +844,31 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + size = min((unsigned long)(end - blk), max_blocks); + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk +=
size; + } +#endif +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { @@ -869,6 +894,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); } } else { __remove_discard_cmd(sbi, dc); From 0b1e7ba9ee12277e92fc9043fd8a9f718e8bd9c1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 29 Jun 2017 23:20:45 +0800 Subject: [PATCH 353/804] f2fs: skip ->writepages for {meta,node}_inode during recovery Skip ->writepages prior to ->writepage for {meta,node}_inode during recovery, hence the unneeded loop in ->writepages can be avoided. Moreover, check SBI_POR_DOING earlier while writing back pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/data.c | 13 +++++++------ fs/f2fs/node.c | 3 +++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 69641cf7fd6f..52ed60e72a25 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -270,6 +270,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 50048986bed9..4a1730cfe86c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1490,6 +1490,9 @@ static int __write_data_page(struct page *page, bool *submitted, trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1503,8 +1506,6 @@ static int
__write_data_page(struct page *page, bool *submitted, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1752,6 +1753,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1761,10 +1766,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b36b34f45bae..f5eebb99f57a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1690,6 +1690,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); From 76b23d9006666425dce0ff700370c4887081129d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 26 Jun 2017 10:41:35 +0800 Subject: [PATCH 354/804] f2fs: do not set LOST_PINO for newly created dir Since directories will be written back with checkpoint and fsync a directory will always write CP, there is no need to set LOST_PINO after creating a directory. 
Signed-off-by: Sheng Yong Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a87a5ecca74d..1380c442648b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -415,7 +415,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. From deb4cef3cba9854c07641dd29b63f16dc535889d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 26 Jun 2017 10:41:36 +0800 Subject: [PATCH 355/804] f2fs: do not set LOST_PINO for renamed dir After renaming a directory, fsck could detect unmatched pino. The scenario can be reproduced as the following: $ mkdir /bar/subbar /foo $ rename /bar/subbar /foo Then fsck will report: [ASSERT] (__chk_dots_dentries:1182) --> Bad inode number[0x3] for '..', parent parent ino is [0x4] Rename sets LOST_PINO for old_inode. However, the flag cannot be cleared, since dir is written back with CP. So, let's get rid of LOST_PINO for a renamed dir and fix the pino directly at the end of rename. 
Signed-off-by: Sheng Yong Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 77349d51f952..82714cdde5f2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -769,7 +769,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); From 0b359879b966344cb2aa51e3906b422dfd201974 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Jul 2017 12:17:24 +0800 Subject: [PATCH 356/804] Revert "f2fs: fix to clean previous mount option when remount_fs" Don't clear old mount option before parse new option during ->remount_fs like other generic filesystems. This reverts commit 26666c8a4366debae30ae37d0688b2bec92d196a. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e4202585f92e..7f3c99d43579 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -847,7 +847,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ From 4f7f22b7c85a4b25247df14955c64761b401ebc5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Jul 2017 01:11:31 +0800 Subject: [PATCH 357/804] f2fs: don't count inode block in in-memory inode.i_blocks Previously, we count all inode consumed blocks including inode block, xattr block, index block, data block into i_blocks, for other generic filesystems, they won't count inode block into i_blocks, so for userspace applications or quota system, they may detect incorrect block count according to i_blocks value in inode. 
This patch changes to count all blocks into inode.i_blocks excluding inode block, for on-disk i_blocks, we keep counting inode block for backward compatibility. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 22 ++++++++++++---------- fs/f2fs/inode.c | 4 ++-- fs/f2fs/node.c | 16 ++++++---------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 33681a2f160d..3e0832973a2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1422,8 +1422,6 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ @@ -1431,8 +1429,7 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; - return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > - (F2FS_DEFAULT_ALLOCATED_BLOCKS + xattr_block); + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1619,7 +1616,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; @@ -1639,8 +1636,12 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, return false; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true); + } sbi->total_valid_node_count++; sbi->total_valid_block_count++; @@ -1651,15 +1652,16 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); 
f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1ff5bd418d87..e42a7a8805dc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -130,7 +130,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks)); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -268,7 +268,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks)); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); if (et) { read_lock(&et->lock); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f5eebb99f57a..81c8d4eca6b9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -678,15 +678,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -694,7 +690,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); 
f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1044,7 +1040,7 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { + if (unlikely(!inc_valid_node_count(sbi, dn->inode, !ofs))) { err = -ENOSPC; goto fail; } @@ -2210,14 +2206,14 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: update xattr nid in inode */ remove_free_nid(sbi, new_xnid); f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(!inc_valid_node_count(sbi, inode))) + if (unlikely(!inc_valid_node_count(sbi, inode, false))) f2fs_bug_on(sbi, 1); update_inode_page(inode); @@ -2275,7 +2271,7 @@ retry: new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(!inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); From e4a2b45da36b6e2c31bdee76550f0eccd1f9a21f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Jul 2017 14:46:01 -0700 Subject: [PATCH 358/804] f2fs: relax migratepage for atomic written page In order to avoid lock contention for atomic written pages, we'd better give EBUSY in f2fs_migrate_page when mode is asynchronous. We expect it will be released soon as transaction commits. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4a1730cfe86c..dc204f178b13 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2202,8 +2202,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping, From a2bd44aca5f999b5fb69428448f88e4e8823c9ee Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Jul 2017 14:10:15 +0800 Subject: [PATCH 359/804] f2fs: use spin_{,un}lock_irq{save,restore} generic/361 reports the warning below. This is because, once someone has entered the critical region of sbi.cp_lock, if write_end_io.f2fs_stop_checkpoint is invoked from a triggered IRQ, we will encounter a deadlock. So this patch changes to use spin_{,un}lock_irq{save,restore} to create a critical region with IRQs disabled, avoiding the potential deadlock. irq event stamp: 83391573 loop: Write error at byte offset 438729728, length 1024. hardirqs last enabled at (83391573): [] restore_all+0xf/0x65 hardirqs last disabled at (83391572): [] reschedule_interrupt+0x30/0x3c loop: Write error at byte offset 438860288, length 1536. softirqs last enabled at (83389244): [] __do_softirq+0x1ae/0x476 softirqs last disabled at (83389237): [] do_softirq_own_stack+0x2c/0x40 loop: Write error at byte offset 438990848, length 2048. ================================ WARNING: inconsistent lock state 4.12.0-rc2+ #30 Tainted: G O -------------------------------- inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
xfs_io/7959 [HC1[1]:SC0[0]:HE0:SE1] takes: (&(&sbi->cp_lock)->rlock){?.+...}, at: [] f2fs_stop_checkpoint+0x1c/0x50 [f2fs] {HARDIRQ-ON-W} state was registered at: __lock_acquire+0x527/0x7b0 lock_acquire+0xae/0x220 _raw_spin_lock+0x42/0x50 do_checkpoint+0x165/0x9e0 [f2fs] write_checkpoint+0x33f/0x740 [f2fs] __f2fs_sync_fs+0x92/0x1f0 [f2fs] f2fs_sync_fs+0x12/0x20 [f2fs] sync_filesystem+0x67/0x80 generic_shutdown_super+0x27/0x100 kill_block_super+0x22/0x50 kill_f2fs_super+0x3a/0x40 [f2fs] deactivate_locked_super+0x3d/0x70 deactivate_super+0x40/0x60 cleanup_mnt+0x39/0x70 __cleanup_mnt+0x10/0x20 task_work_run+0x69/0x80 exit_to_usermode_loop+0x57/0x85 do_fast_syscall_32+0x18c/0x1b0 entry_SYSENTER_32+0x4c/0x7b irq event stamp: 1957420 hardirqs last enabled at (1957419): [] _raw_spin_unlock_irq+0x27/0x50 hardirqs last disabled at (1957420): [] call_function_single_interrupt+0x30/0x3c softirqs last enabled at (1953784): [] __do_softirq+0x1ae/0x476 softirqs last disabled at (1953773): [] do_softirq_own_stack+0x2c/0x40 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&sbi->cp_lock)->rlock); lock(&(&sbi->cp_lock)->rlock); *** DEADLOCK *** 2 locks held by xfs_io/7959: #0: (sb_writers#13){.+.+.+}, at: [] vfs_write+0x16a/0x190 #1: (&sb->s_type->i_mutex_key#16){+.+.+.}, at: [] f2fs_file_write_iter+0x25/0x140 [f2fs] stack backtrace: CPU: 2 PID: 7959 Comm: xfs_io Tainted: G O 4.12.0-rc2+ #30 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack+0x5f/0x92 print_usage_bug+0x1d3/0x1dd ? check_usage_backwards+0xe0/0xe0 mark_lock+0x23d/0x280 __lock_acquire+0x699/0x7b0 ? __this_cpu_preempt_check+0xf/0x20 ? trace_hardirqs_off_caller+0x91/0xe0 lock_acquire+0xae/0x220 ? f2fs_stop_checkpoint+0x1c/0x50 [f2fs] _raw_spin_lock+0x42/0x50 ? 
f2fs_stop_checkpoint+0x1c/0x50 [f2fs] f2fs_stop_checkpoint+0x1c/0x50 [f2fs] f2fs_write_end_io+0x147/0x150 [f2fs] bio_endio+0x7a/0x1e0 blk_update_request+0xad/0x410 blk_mq_end_request+0x16/0x60 lo_complete_rq+0x3c/0x70 __blk_mq_complete_request_remote+0x11/0x20 flush_smp_call_function_queue+0x6d/0x120 ? debug_smp_processor_id+0x12/0x20 generic_smp_call_function_single_interrupt+0x12/0x30 smp_call_function_single_interrupt+0x25/0x40 call_function_single_interrupt+0x37/0x3c EIP: _raw_spin_unlock_irq+0x2d/0x50 EFLAGS: 00000296 CPU: 2 EAX: 00000001 EBX: d2ccc51c ECX: 00000001 EDX: c1aacebd ESI: 00000000 EDI: 00000000 EBP: c96c9d1c ESP: c96c9d18 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 ? inherit_task_group.isra.98.part.99+0x6b/0xb0 __add_to_page_cache_locked+0x1d4/0x290 add_to_page_cache_lru+0x38/0xb0 pagecache_get_page+0x8e/0x200 f2fs_write_begin+0x96/0xf00 [f2fs] ? trace_hardirqs_on_caller+0xdd/0x1c0 ? current_time+0x17/0x50 ? trace_hardirqs_on+0xb/0x10 generic_perform_write+0xa9/0x170 __generic_file_write_iter+0x1a2/0x1f0 ? f2fs_preallocate_blocks+0x137/0x160 [f2fs] f2fs_file_write_iter+0x6e/0x140 [f2fs] ? 
__lock_acquire+0x429/0x7b0 __vfs_write+0xc1/0x140 vfs_write+0x9b/0x190 SyS_pwrite64+0x63/0xa0 do_fast_syscall_32+0xa1/0x1b0 entry_SYSENTER_32+0x4c/0x7b EIP: 0xb7786c61 EFLAGS: 00000293 CPU: 2 EAX: ffffffda EBX: 00000003 ECX: 08416000 EDX: 00001000 ESI: 18b24000 EDI: 00000000 EBP: 00000003 ESP: bf9b36b0 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b Fixes: aaec2b1d1879 ("f2fs: introduce cp_lock to protect updating of ckpt_flags") Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 ++++++----- fs/f2fs/f2fs.h | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 52ed60e72a25..aabf7c4984d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1054,8 +1054,9 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if ((cpc->reason & CP_UMOUNT) && le32_to_cpu(ckpt->cp_pack_total_block_count) > @@ -1086,14 +1087,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1135,12 +1136,12 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = 
npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3e0832973a2c..b44519fef652 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1321,9 +1321,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1337,22 +1339,26 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { + unsigned long flags; + set_sbi_flag(sbi, SBI_NEED_FSCK); if (lock) - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); kfree(NM_I(sbi)->nat_bits); NM_I(sbi)->nat_bits = NULL; if (lock) - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, From f2cade6d8a7ec93e8e7392c4c161d26ea3350aad Mon Sep 17 00:00:00 2001 From: Jaegeuk 
Kim Date: Wed, 21 Jun 2017 17:52:39 -0700 Subject: [PATCH 360/804] f2fs: avoid deadlock caused by lock order of page and lock_op - punch_hole - fill_zero - f2fs_lock_op - get_new_data_page - lock_page - f2fs_write_data_pages - lock_page - do_write_data_page - f2fs_lock_op Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dc204f178b13..e5efea00ad05 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1402,8 +1402,9 @@ int do_write_data_page(struct f2fs_io_info *fio) } } - if (fio->need_lock == LOCK_REQ) - f2fs_lock_op(fio->sbi); + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) @@ -1665,7 +1666,7 @@ retry: } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1701,6 +1702,15 @@ continue_unlock: unlock_page(page); ret = 0; continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; } done_index = page->index + 1; done = 1; From 9df0a9280fcbdabf170ad3c0d36548bf9fc37e67 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 9 Jul 2017 00:13:07 +0800 Subject: [PATCH 361/804] f2fs: support plain user/group quota This patch adds support for plain user/group quota. Change Note by Jaegeuk Kim. - Use f2fs page cache for quota files in order to consider garbage collection. As a result, quota files do not tolerate sudden power cuts, so the user needs to run quotacheck. - setattr() calls dquot_transfer which will transfer inode->i_blocks. We can't reclaim that during f2fs_evict_inode(). So, we need to count node blocks as well in order to match i_blocks with dquot's space.
Note that Chao wrote a patch to count inode->i_blocks without the inode block. (f2fs: don't count inode block in in-memory inode.i_blocks) - In f2fs_remount, we need to make the superblock RW prior to dquot_resume. - handle fault_injection case during f2fs_quota_off_umount - TODO: Project quota Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 + fs/f2fs/data.c | 10 +- fs/f2fs/f2fs.h | 92 +++++++--- fs/f2fs/file.c | 34 +++- fs/f2fs/inode.c | 5 + fs/f2fs/namei.c | 66 ++++++- fs/f2fs/node.c | 9 +- fs/f2fs/super.c | 278 +++++++++++++++++++++++++++++ 8 files changed, 454 insertions(+), 42 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 8e454b0559f1..3ba27469a8dd 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -160,6 +160,8 @@ mode=%s Control block allocation mode which supports "adaptive" writes towards main area. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting.
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e5efea00ad05..b8588c8360e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -490,14 +490,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -748,6 +749,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; @@ -756,8 +758,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b44519fef652..c1d323018738 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_FS_ENCRYPTION #include #else @@ -89,6 +90,8 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -588,6 +591,12 @@ struct 
f2fs_inode_info { nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ @@ -1443,17 +1452,23 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, block_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); - return false; + release = *count; + goto enospc; } #endif /* @@ -1468,17 +1483,24 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; + release = diff; sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, release); + 
return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, @@ -1492,7 +1514,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1621,11 +1643,18 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); @@ -1633,28 +1662,33 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, if (unlikely(valid_block_count + sbi->reserved_blocks > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; - } - - if (inode) { - if (is_inode) - f2fs_mark_inode_dirty_sync(inode, true); - else - f2fs_i_blocks_write(inode, 1, true); + goto enospc; } sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, @@ -1666,12 
+1700,13 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - if (!is_inode) - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1946,14 +1981,21 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - block_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - blkcnt_t sectors = diff << F2FS_LOG_SECTORS_PER_BLOCK; - inode->i_blocks = add ? inode->i_blocks + sectors : - inode->i_blocks - sectors; + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9978e86c89c5..5700722a1c42 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -445,11 +445,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - int ret = generic_file_open(inode, filp); struct dentry *dir; - if (!ret && f2fs_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); + if (f2fs_encrypted_inode(inode)) { + int ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; if (!fscrypt_has_encryption_key(inode)) @@ -462,7 +461,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return -EPERM; } dput(dir); - return ret; + return 
dquot_file_open(inode, filp); } int truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -692,6 +691,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + if (attr->ia_valid & ATTR_SIZE) { if (f2fs_encrypted_inode(inode)) { err = fscrypt_get_encryption_info(inode); @@ -981,9 +994,9 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1508,6 +1521,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode_lock(inode); + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + inode_unlock(inode); + ret = -EPERM; + goto unlock_out; + } + flags = f2fs_mask_flags(inode->i_mode, flags); oldflags = fi->i_flags; @@ -1527,7 +1547,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); f2fs_mark_inode_dirty_sync(inode, false); - +unlock_out: inode_unlock(inode); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e42a7a8805dc..6cd312a17c69 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -373,6 +373,8 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; + dquot_initialize(inode); + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -405,8 +407,11 @@ retry: if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 82714cdde5f2..766439e3ba42 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,10 +55,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory encrypted, then we should encrypt the inode. 
*/ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); @@ -85,6 +95,16 @@ fail: set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +156,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -180,6 +204,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -347,6 +375,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -410,6 +442,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -497,6 +533,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -545,6 +585,10 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + err = 
dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -580,6 +624,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -673,6 +721,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -853,6 +909,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 81c8d4eca6b9..d737ae43ce08 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1040,10 +1040,9 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode, !ofs))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } + #ifdef CONFIG_F2FS_CHECK_FS get_node_info(sbi, dn->nid, &new_ni); f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); @@ -2213,7 +2212,7 @@ recover_xnid: /* 2: update xattr nid in inode */ remove_free_nid(sbi, new_xnid); f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(!inc_valid_node_count(sbi, inode, false))) + if (unlikely(inc_valid_node_count(sbi, inode, false))) f2fs_bug_on(sbi, 1); update_inode_page(inode); @@ -2271,7 +2270,7 @@ retry: new_ni = old_ni; new_ni.ino = ino; - 
if (unlikely(!inc_valid_node_count(sbi, NULL, true))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7f3c99d43579..49dd2b8efc03 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -106,6 +107,8 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_usrquota, + Opt_grpquota, Opt_err, }; @@ -141,6 +144,8 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, {Opt_err, NULL}, }; @@ -380,6 +385,20 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; +#else + case Opt_usrquota: + case Opt_grpquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -421,6 +440,10 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->dio_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; return &fi->vfs_inode; @@ -561,11 +584,14 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } +static void f2fs_quota_off_umount(struct super_block *sb); static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + f2fs_quota_off_umount(sb); + /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -783,6 +809,12 @@ static int 
f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fault_injection=%u", sbi->fault_info.inject_rate); #endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif return 0; } @@ -823,6 +855,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -836,6 +869,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * need to restore them. */ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; /* recover superblocks we couldn't write due to previous RO mount */ @@ -861,6 +895,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -925,12 +969,235 @@ restore_gc: restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); 
+ struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return err; + inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t 
*f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, -1); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, -1); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +static void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + 
f2fs_quota_off(sb, type); +} + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#else +static inline void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, @@ -938,6 +1205,11 @@ static struct super_operations f2fs_sops = { .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1684,6 +1956,12 @@ try_onemore: sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; From 972aaba68e97f8eabdfcfe594a9b78d604ee613f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 10 Jul 2017 19:16:28 -0700 Subject: [PATCH 362/804] f2fs: make more close to v4.13-rc1 Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/file.c | 4 ++-- fs/f2fs/namei.c | 4 ++-- fs/f2fs/node.c | 6 +++--- fs/f2fs/segment.c | 18 +++++++++--------- fs/f2fs/super.c | 
8 ++++---- 6 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c1d323018738..ecfd7fc02b57 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2228,7 +2228,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +static inline void *kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -2238,7 +2238,7 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) return ret; } -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +static inline void *kvzalloc(size_t size, gfp_t flags) { void *ret; @@ -2891,4 +2891,5 @@ static inline bool f2fs_may_encrypt(struct inode *inode) return 0; #endif } + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5700722a1c42..789d75beb7ed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1048,11 +1048,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 766439e3ba42..541d755193c8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -988,7 +988,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = CURRENT_TIME; + old_dir->i_ctime = current_time(old_dir); if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -1003,7 +1003,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); 
up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = CURRENT_TIME; + new_dir->i_ctime = current_time(new_dir); if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d737ae43ce08..d0d6a5830181 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2664,17 +2664,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0f6cded83c7b..9744e8c9d308 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1243,8 +1243,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? 
FDEV(devi).path: "", @@ -2998,13 +2998,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -3037,7 +3037,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -3088,12 +3088,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -3273,7 +3273,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -3295,7 +3295,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + 
dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 49dd2b8efc03..1eb2013fece6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1638,16 +1638,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); @@ -1789,7 +1789,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* Initialize single device information */ if (!RDEV(0).path[0]) { #ifdef CONFIG_BLK_DEV_ZONED - if (bdev_zoned_model(sbi->sb->s_bdev) == BLK_ZONED_NONE) + if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; max_devices = 1; #else From f6ba8b4893da189199657b0db85705ee247d9c36 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 22 May 2017 18:14:06 -0700 Subject: [PATCH 363/804] fscrypt: inline fscrypt_free_filename() fscrypt_free_filename() only needs to do a kfree() of crypto_buf.name, which works well as an inline function. We can skip setting the various pointers to NULL, since no user cares about it (the name is always freed just before it goes out of scope). 
Signed-off-by: Eric Biggers Reviewed-by: David Gstir Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 9 --------- include/linux/fscrypt_supp.h | 7 ++++++- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d1bb02b1ee58..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -453,12 +453,3 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index cd4e82c17304..32e2fcf13b01 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -47,7 +47,12 @@ extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); /* fname.c */ extern int fscrypt_setup_filename(struct inode *, const struct qstr *, int lookup, struct fscrypt_name *); -extern void fscrypt_free_filename(struct fscrypt_name *); + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + kfree(fname->crypto_buf.name); +} + extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, struct fscrypt_str *); From 73a2900701bac20ceb808514dc3b275e96484a79 Mon Sep 17 00:00:00 2001 From: Daniel Walter Date: Mon, 19 Jun 2017 09:27:58 +0200 Subject: [PATCH 364/804] fscrypt: add support for AES-128-CBC fscrypt provides facilities to use different encryption algorithms which are selectable by userspace when setting the encryption policy. Currently, only AES-256-XTS for file contents and AES-256-CBC-CTS for file names are implemented. This is a clear case of kernel offers the mechanism and userspace selects a policy. Similar to what dm-crypt and ecryptfs have. 
This patch adds support for using AES-128-CBC for file contents and AES-128-CBC-CTS for file name encryption. To mitigate watermarking attacks, IVs are generated using the ESSIV algorithm. While AES-CBC is actually slightly less secure than AES-XTS from a security point of view, there is more widespread hardware support. Using AES-CBC gives us the acceptable performance while still providing a moderate level of security for persistent storage. Especially low-powered embedded devices with crypto accelerators such as CAAM or CESA often only support AES-CBC. Since using AES-CBC over AES-XTS is basically thought of a last resort, we use AES-128-CBC over AES-256-CBC since it has less encryption rounds and yields noticeable better performance starting from a file size of just a few kB. Signed-off-by: Daniel Walter [david@sigma-star.at: addressed review comments] Signed-off-by: David Gstir Reviewed-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 23 +++-- fs/crypto/fscrypt_private.h | 9 +- fs/crypto/keyinfo.c | 175 +++++++++++++++++++++++++-------- fs/crypto/policy.c | 8 +- include/linux/fscrypt_common.h | 16 +-- include/uapi/linux/fs.h | 2 + 6 files changed, 174 insertions(+), 59 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6d6eca394d4d..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; @@ -147,8 +148,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -158,6 +159,16 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, BUG_ON(len == 0); + BUILD_BUG_ON(sizeof(iv) != 
FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -170,15 +181,11 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(lblk_num); - memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); sg_set_page(&src, src_page, len, offs); - skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -477,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 0b65491de28a..79d79755d79b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,10 +12,13 @@ #define _FSCRYPT_PRIVATE_H #include +#include /* Encryption parameters */ -#define FS_XTS_TWEAK_SIZE 16 +#define FS_IV_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 #define FS_AES_256_GCM_KEY_SIZE 32 #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 @@ -54,6 +57,7 @@ struct fscrypt_info { u8 ci_filename_mode; u8 ci_flags; struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; }; @@ -97,4 +101,7 @@ extern int 
fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 4636c18c2fb9..66e0728e9bbe 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,8 +10,13 @@ #include #include +#include +#include +#include #include "fscrypt_private.h" +static struct crypto_shash *essiv_hash_tfm; + static void derive_crypt_complete(struct crypto_async_request *req, int rc) { struct fscrypt_completion_result *ecr = req->data; @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. 
*/ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,7 +82,7 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix) + const char *prefix, int min_keysize) { char *description; struct key *keyring_key; @@ -111,50 +116,60 @@ static int validate_user_key(struct fscrypt_info *crypt_info, master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE 
}, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; + } + if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; - } - - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -163,9 
+178,76 @@ static void put_crypt_info(struct fscrypt_info *ci) return; crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. 
+ */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; @@ -212,6 +294,7 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -228,10 +311,12 @@ int fscrypt_get_encryption_info(struct inode *inode) if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix); + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -243,18 +328,30 @@ int fscrypt_get_encryption_info(struct inode *inode) ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); + pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", + __func__, res, inode->i_ino); goto out; } crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + /* + * if the provided key is longer than keysize, we use the first + * keysize bytes of the derived key only + */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; + if (S_ISREG(inode->i_mode) && + crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { + res = init_essiv_generator(crypt_info, raw_key, keysize); + if (res) { + pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", + __func__, res, inode->i_ino); + goto out; + } + } if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) crypt_info = NULL; out: diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 210976e7a269..9914d51dff86 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -38,12 +38,8 @@ static int create_encryption_context_from_policy(struct inode *inode, memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) - return -EINVAL; - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; if (policy->flags & ~FS_POLICY_FLAGS_VALID) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 0a30c106c1e5..4022c61f7e9b 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -91,14 +91,18 @@ static inline bool fscrypt_dummy_context_enabled(struct inode *inode) return false; } -static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +static inline bool fscrypt_valid_enc_modes(u32 
contents_mode, + u32 filenames_mode) { - return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); -} + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; -static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) -{ - return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; } static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index a1533084395c..f3ef5016cf9c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -189,6 +189,8 @@ struct inodes_stat_t { #define FS_ENCRYPTION_MODE_AES_256_GCM 2 #define FS_ENCRYPTION_MODE_AES_256_CBC 3 #define FS_ENCRYPTION_MODE_AES_256_CTS 4 +#define FS_ENCRYPTION_MODE_AES_128_CBC 5 +#define FS_ENCRYPTION_MODE_AES_128_CTS 6 struct fscrypt_policy { From 8e85002136aab271e4c9bf4c60da5d5c1b46dc4b Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 11 Jul 2017 17:30:33 +0100 Subject: [PATCH 365/804] f2fs: remove extra inode_unlock() in error path This commit removes an extra inode_unlock() that is being done in function f2fs_ioc_setflags error path. While there, get rid of a useless 'out' label as well. Fixes: 0abd675e97e6 ("f2fs: support plain user/group quota") Signed-off-by: Luis Henriques Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 789d75beb7ed..435927e6c6f3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1523,7 +1523,6 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) /* Is it quota file? 
Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - inode_unlock(inode); ret = -EPERM; goto unlock_out; } @@ -1534,9 +1533,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } @@ -1549,7 +1547,6 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) f2fs_mark_inode_dirty_sync(inode, false); unlock_out: inode_unlock(inode); -out: mnt_drop_write_file(filp); return ret; } From 39480d9be6996dcafae10a9ca147ce5c77efaa83 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 11 Jul 2017 14:56:49 -0700 Subject: [PATCH 366/804] f2fs: Don't clear SGID when inheriting ACLs This patch copies commit b7f8a09f80: "btrfs: Don't clear SGID when inheriting ACLs" written by Jan. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index ad26f59ba464..05d6f6095549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -214,7 +214,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { + if (acl && !ipage) { error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; From 7c7c34c88c646b70285fb2c3b9d004e3fbe0d011 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 13 Jul 2017 17:45:21 -0700 Subject: [PATCH 367/804] f2fs: include seq_file.h for sysfs.c This patch includes seq_file.h to avoid compile error. 
Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9adc202fcd6f..71191d89917d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ */ #include #include +#include #include "f2fs.h" #include "segment.h" From 745ad3de2fbae66415d2734837384ab6e648e357 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Jul 2017 11:45:21 -0700 Subject: [PATCH 368/804] f2fs: avoid cpu lockup Before retrying to flush data or dentry pages, we need to release the CPU in order to avoid triggering the watchdog. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index aabf7c4984d3..e8ceff42d09b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -880,6 +880,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) struct inode *inode; struct f2fs_inode_info *fi; bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, get_pages(sbi, is_dir ? @@ -902,8 +903,17 @@ retry: inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { + unsigned long cur_ino = inode->i_ino; + filemap_fdatawrite(inode->i_mapping); iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + } else { + ino = cur_ino; + } } else { /* * We should submit bio, since it exists several From 92fd02d1052e69d85e577f9b5ec5af0ec1de1dc3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 17 Jul 2017 19:16:11 +0800 Subject: [PATCH 369/804] f2fs: remove unused input parameter This patch removes an unused input parameter from the function new_node_page.
Signed-off-by: Yunlei He Signed-off-by: Yong Sheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/node.c | 7 +++---- fs/f2fs/xattr.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ecfd7fc02b57..43c0956f7ce7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2369,8 +2369,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *get_node_page_ra(struct page *parent, int start); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d0d6a5830181..ed4014f677c2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -613,7 +613,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -1022,11 +1022,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aaf0a4167175..aad59c7c3a63 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -489,7 
+489,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); From f19a8a046a172655102a9aa0a995c4c77143df0c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 16 Jul 2017 15:08:54 +0800 Subject: [PATCH 370/804] f2fs: spread struct f2fs_dentry_ptr for inline path Use f2fs_dentry_ptr structure to indicate inline dentry structure as much as possible, so we can wrap inline dentry with size-fixed fields to the one with size-changeable fields. With this change, we can handle size-changeable inline dentry more easily. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 ++++- fs/f2fs/inline.c | 47 ++++++++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 43c0956f7ce7..a02645780fd7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -428,10 +428,11 @@ struct f2fs_flush_device { /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; static inline void make_dentry_ptr_block(struct inode *inode, @@ -439,6 +440,7 @@ static inline void make_dentry_ptr_block(struct inode *inode, { d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; @@ -449,6 +451,7 @@ static inline void make_dentry_ptr_inline(struct inode *inode, { d->inode = inode; d->max = NR_INLINE_DENTRY; + d->nr_bitmap = INLINE_DENTRY_BITMAP_SIZE; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c 
index 03c86e55e4a7..918eb89eb404 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -342,6 +342,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -360,21 +361,20 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(NULL, &src, inline_dentry); + make_dentry_ptr_block(NULL, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because them haven't been used so far. 
*/ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -511,9 +511,10 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, return PTR_ERR(ipage); inline_dentry = inline_data_addr(ipage); - bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { + make_dentry_ptr_inline(NULL, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; @@ -534,7 +535,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -558,6 +558,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -566,10 +567,11 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_wait_on_page_writeback(page, NODE, true); inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + make_dentry_ptr_inline(NULL, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 
1); @@ -587,19 +589,20 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct page *ipage; unsigned int bit_pos = 2; struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; inline_dentry = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + make_dentry_ptr_inline(NULL, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -614,7 +617,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_dentry_ptr d; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); @@ -627,7 +632,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? err : 0; From d143729d715df2a467e52fbe401f7ab2cc162dca Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 18 Jul 2017 09:48:12 +0800 Subject: [PATCH 371/804] f2fs: alloc new nids for xattr block in recovery recovery file A: recovery file B: -get_dnode_of_data -alloc_nid -recover_xattr_data -set_node_addr(sbi, &ni, NEW_ADDR, false); --->bug_on for nid has been used by file A In recovery process, new allocated node blocks may "reuse" xattr block nids, this patch alloc new nids for xattr blocks in recovery process to avoid this problem. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ed4014f677c2..46fb5c2693ad 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include @@ -2193,7 +2194,8 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; struct page *xpage; @@ -2209,22 +2211,22 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_xnid: /* 2: update xattr nid in inode */ - remove_free_nid(sbi, new_xnid); - f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(inc_valid_node_count(sbi, inode, false))) - f2fs_bug_on(sbi, 1); + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + alloc_nid_done(sbi, new_xnid); update_inode_page(inode); /* 3: update and set xattr node page dirty */ - xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); - if (!xpage) - return -ENOMEM; + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); - - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); set_page_dirty(xpage); f2fs_put_page(xpage, 1); From ada848409822bd027643a98e5cae45958650beca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Sun, 23 Jul 2017 22:32:54 -0300 Subject: [PATCH 372/804] f2fs: preserve i_mode if __f2fs_set_acl() fails 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing a file's acl mask, __f2fs_set_acl() will first set the group bits of i_mode to the value of the mask, and only then set the actual extended attribute representing the new acl. If the second part fails (due to lack of space, for example) and the file had no acl attribute to begin with, the system will from now on assume that the mask permission bits are actual group permission bits, potentially granting access to the wrong users. Prevent this by only changing the inode mode after the acl has been set. Signed-off-by: Ernesto A. Fernández Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 05d6f6095549..112f8e04c549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -210,15 +210,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; From 6bf7fc57146876306137d2229d42082165af5dbf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Jul 2017 10:59:55 -0700 Subject: [PATCH 373/804] f2fs: give a try to do atomic write in -ENOMEM case It'd be better to retry writing atomic pages when we get -ENOMEM. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9744e8c9d308..bf9d66fa0af5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -309,17 +309,21 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; last_idx = page->index; From 79e86c92c62c8bc5b699f657b016fd281a39f2ac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Jul 2017 19:46:29 -0700 Subject: [PATCH 374/804] f2fs: don't give partially written atomic data from process crash This patch resolves the below scenario. == Process 1 == == Process 2 == open(w) open(rw) begin write(new_#1) process_crash f_op->flush locks_remove_posix f_op->release read (new_#1) In order to avoid corrupted database caused by new_#1, we must do roll-back at process_crash time. In order to check that, this patch records the task which begins the transaction, and does roll-back in f_op->flush before removing file locks. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a02645780fd7..8ec1afac4897 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -603,6 +603,7 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 435927e6c6f3..368aa332c833 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1480,6 +1480,22 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } +static int f2fs_file_flush(struct file *file, fl_owner_t id) +{ + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. 
+ */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; +} + #define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) #define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) @@ -1599,6 +1615,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) } inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: @@ -2496,6 +2513,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, From e088277a813b12d98f81f79da0c738b54974d56a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 22 Jul 2017 08:52:23 +0800 Subject: [PATCH 375/804] f2fs: make background threads of f2fs being aware of freezing When ->freeze_fs is called from lvm for doing snapshot, it needs to make sure there will be no more changes in filesystem's data, however, previously, background threads like GC thread wasn't aware of freezing, so in environment with active background threads, data of snapshot becomes unstable. This patch fixes this issue by adding sb_{start,end}_intwrite in below background threads: - GC thread - flush thread - discard thread Note that, don't use sb_start_intwrite() in gc_thread_func() due to: generic/241 reports below bug: ====================================================== WARNING: possible circular locking dependency detected 4.13.0-rc1+ #32 Tainted: G O ------------------------------------------------------ f2fs_gc-250:0/22186 is trying to acquire lock: (&sbi->gc_mutex){+.+...}, at: [] f2fs_sync_fs+0x7b/0x1b0 [f2fs] but task is already holding lock: (sb_internal#2){++++.-}, at: [] gc_thread_func+0x159/0x4a0 [f2fs] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (sb_internal#2){++++.-}: __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 __sb_start_write+0x11d/0x1f0 f2fs_evict_inode+0x2d6/0x4e0 [f2fs] evict+0xa8/0x170 iput+0x1fb/0x2c0 f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] write_checkpoint+0x1b1/0x750 [f2fs] f2fs_sync_fs+0x85/0x1b0 [f2fs] f2fs_do_sync_file.isra.24+0x137/0xa30 [f2fs] f2fs_sync_file+0x34/0x40 [f2fs] vfs_fsync_range+0x4a/0xa0 do_fsync+0x3c/0x60 SyS_fdatasync+0x15/0x20 do_fast_syscall_32+0xa1/0x1b0 entry_SYSENTER_32+0x4c/0x7b -> #1 (&sbi->cp_mutex){+.+...}: __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 __mutex_lock+0x4f/0x830 mutex_lock_nested+0x25/0x30 write_checkpoint+0x2f/0x750 [f2fs] f2fs_sync_fs+0x85/0x1b0 [f2fs] sync_filesystem+0x67/0x80 generic_shutdown_super+0x27/0x100 kill_block_super+0x22/0x50 kill_f2fs_super+0x3a/0x40 [f2fs] deactivate_locked_super+0x3d/0x70 deactivate_super+0x40/0x60 cleanup_mnt+0x39/0x70 __cleanup_mnt+0x10/0x20 task_work_run+0x69/0x80 exit_to_usermode_loop+0x57/0x92 do_fast_syscall_32+0x18c/0x1b0 entry_SYSENTER_32+0x4c/0x7b -> #0 (&sbi->gc_mutex){+.+...}: validate_chain.isra.36+0xc50/0xdb0 __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 __mutex_lock+0x4f/0x830 mutex_lock_nested+0x25/0x30 f2fs_sync_fs+0x7b/0x1b0 [f2fs] f2fs_balance_fs_bg+0xb9/0x200 [f2fs] gc_thread_func+0x302/0x4a0 [f2fs] kthread+0xe9/0x120 ret_from_fork+0x19/0x24 other info that might help us debug this: Chain exists of: &sbi->gc_mutex --> &sbi->cp_mutex --> sb_internal#2 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_internal#2); lock(&sbi->cp_mutex); lock(sb_internal#2); lock(&sbi->gc_mutex); *** DEADLOCK *** 1 lock held by f2fs_gc-250:0/22186: #0: (sb_internal#2){++++.-}, at: [] gc_thread_func+0x159/0x4a0 [f2fs] stack backtrace: CPU: 2 PID: 22186 Comm: f2fs_gc-250:0 Tainted: G O 4.13.0-rc1+ #32 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Call Trace: dump_stack+0x5f/0x92 
print_circular_bug+0x1b3/0x1bd validate_chain.isra.36+0xc50/0xdb0 ? __this_cpu_preempt_check+0xf/0x20 __lock_acquire+0x405/0x7b0 lock_acquire+0xae/0x220 ? f2fs_sync_fs+0x7b/0x1b0 [f2fs] __mutex_lock+0x4f/0x830 ? f2fs_sync_fs+0x7b/0x1b0 [f2fs] mutex_lock_nested+0x25/0x30 ? f2fs_sync_fs+0x7b/0x1b0 [f2fs] f2fs_sync_fs+0x7b/0x1b0 [f2fs] f2fs_balance_fs_bg+0xb9/0x200 [f2fs] gc_thread_func+0x302/0x4a0 [f2fs] ? preempt_schedule_common+0x2f/0x4d ? f2fs_gc+0x540/0x540 [f2fs] kthread+0xe9/0x120 ? f2fs_gc+0x540/0x540 [f2fs] ? kthread_create_on_node+0x30/0x30 ret_from_fork+0x19/0x24 The deadlock occurs in below condition: GC Thread Thread B - sb_start_intwrite - f2fs_sync_file - f2fs_sync_fs - mutex_lock(&sbi->gc_mutex) - write_checkpoint - block_operations - f2fs_sync_inode_meta - iput - sb_start_intwrite - mutex_lock(&sbi->gc_mutex) Fix this by altering sb_start_intwrite to sb_start_write_trylock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 9 +++++++-- fs/f2fs/segment.c | 8 ++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c72da8733ba6..8f30dae0fe46 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -55,6 +55,9 @@ static int gc_thread_func(void *data) } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -69,12 +72,12 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. 
*/ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) @@ -93,6 +96,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bf9d66fa0af5..3573b95f4fab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -485,6 +485,8 @@ repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -503,6 +505,8 @@ repeat: fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -1210,9 +1214,13 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + __issue_discard_cmd(sbi, true); __wait_discard_cmd(sbi, true); + sb_end_intwrite(sbi->sb); + congestion_wait(BLK_RW_SYNC, HZ/50); } while (!kthread_should_stop()); return 0; From 68a6e4b9740c7e16636360a317c12d4ba870eb21 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Jul 2017 12:58:59 -0700 Subject: [PATCH 376/804] f2fs: add ioctl to expose current features This patch adds an ioctl to provide feature information to the user. For example, SQLite can use this ioctl to detect whether f2fs supports atomic write or not. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8ec1afac4897..4eb067f1b160 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -113,6 +113,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_ENCRYPT 0x0001 #define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -375,6 +376,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_flush_device) #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 368aa332c833..bc732b17be91 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2386,6 +2386,16 @@ out: return ret; } +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. 
*/ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -2428,6 +2438,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_move_range(filp, arg); case F2FS_IOC_FLUSH_DEVICE: return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); default: return -ENOTTY; } @@ -2498,6 +2510,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: break; default: return -ENOIOCTLCMD; From 242ed6f4bbea201af6f4a8cc8db9f4b407bbfb25 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Jul 2017 00:19:05 +0800 Subject: [PATCH 377/804] f2fs: make max inline size changeable This patch makes the below macros calculate max inline size and inline dentry field size while considering a size-changeable reserved space: - MAX_INLINE_DATA - NR_INLINE_DENTRY - INLINE_DENTRY_BITMAP_SIZE - INLINE_RESERVED_SIZE Then, when inline_{data,dentry} options are enabled, it allows us to reserve inline space with different size flexibly for adding newly introduced inode attribute. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +- fs/f2fs/f2fs.h | 48 +++++++++++++++++---- fs/f2fs/inline.c | 95 +++++++++++++++++++++-------------------- fs/f2fs/inode.c | 4 +- fs/f2fs/super.c | 3 ++ include/linux/f2fs_fs.h | 23 +--------- 6 files changed, 96 insertions(+), 81 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b8588c8360e2..f31c71bcdf0e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -812,7 +812,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -1855,7 +1855,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4eb067f1b160..153a487b1189 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -424,6 +424,25 @@ struct f2fs_flush_device { u32 segments; /* # of segments to flush */ }; +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 + +static inline int get_inline_reserved_size(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * (DEF_ADDRS_PER_INODE -\ + get_inline_reserved_size(inode) -\ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + 
INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ @@ -449,14 +468,19 @@ static inline void make_dentry_ptr_block(struct inode *inode, } static inline void make_dentry_ptr_inline(struct inode *inode, - struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) + struct f2fs_dentry_ptr *d, void *t) { + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + d->inode = inode; - d->max = NR_INLINE_DENTRY; - d->nr_bitmap = INLINE_DENTRY_BITMAP_SIZE; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -610,6 +634,8 @@ struct f2fs_inode_info { struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + + int i_inline_reserved; /* reserved size in inline data */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2139,11 +2165,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + int reserved_size = get_inline_reserved_size(inode); - return (void *)&(ri->i_addr[1]); + return (void *)&(ri->i_addr[reserved_size]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2254,6 +2281,11 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } +static inline int get_inline_reserved_size(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_reserved; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 918eb89eb404..ed5b1153901e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -22,7 +22,7 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -44,6 +44,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -51,12 +52,12 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) @@ -67,13 +68,13 @@ void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) + if (from >= MAX_INLINE_DATA(inode)) return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); if (from == 0) @@ -216,8 +217,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); 
+ dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); @@ -255,9 +256,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,11 +286,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -300,9 +301,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -316,19 +317,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if 
(i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -337,7 +338,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; @@ -357,12 +358,12 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); - make_dentry_ptr_inline(NULL, &src, inline_dentry); - make_dentry_ptr_block(NULL, &dst, dentry_blk); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); /* copy data from inline dentry block to new dentry block */ memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); @@ -395,14 +396,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -444,19 +444,19 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { 
f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -473,9 +473,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -484,7 +484,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -500,7 +500,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *inline_dentry = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,8 +510,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { @@ -557,8 +557,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -566,8 +566,8 @@ 
void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) @@ -588,15 +588,15 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *inline_dentry; + void *inline_dentry; struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); @@ -612,9 +612,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -626,7 +626,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -657,7 +657,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -666,7 +666,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr 
<< inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6cd312a17c69..32ec6b23fe01 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -87,9 +87,9 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1eb2013fece6..ac719a3ef848 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -446,6 +446,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) #endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + + fi->i_inline_reserved = DEF_INLINE_RESERVED_SIZE; + return &fi->vfs_inode; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2b7183c5c9a3..bf27f140c21b 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -206,9 +206,6 @@ struct f2fs_extent { #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - F2FS_INLINE_XATTR_ADDRS - 1)) - struct f2fs_inode { __le16 i_mode; /* file mode */ __u8 i_advise; /* file hints */ @@ -465,7 +462,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry 
and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -501,24 +498,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, From 40f4330d958ba11f77c3e443dc98ffe2b9f3cdd1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Jul 2017 00:19:06 +0800 Subject: [PATCH 378/804] f2fs: enhance on-disk inode structure scalability This patch add new flag F2FS_EXTRA_ATTR storing in inode.i_inline to indicate that on-disk structure of current inode is extended. In order to extend, we changed the inode structure a bit: Original one: struct f2fs_inode { ... struct f2fs_extent i_ext; __le32 i_addr[DEF_ADDRS_PER_INODE]; __le32 i_nid[DEF_NIDS_PER_INODE]; } Extended one: struct f2fs_inode { ... struct f2fs_extent i_ext; union { struct { __le16 i_extra_isize; __le16 i_padding; __le32 i_extra_end[0]; }; __le32 i_addr[DEF_ADDRS_PER_INODE]; }; __le32 i_nid[DEF_NIDS_PER_INODE]; } Once F2FS_EXTRA_ATTR is set, we will steal four bytes in the head of i_addr field for storing i_extra_isize and i_padding. 
with i_extra_isize, we can calculate actual size of reserved space in i_addr, available attribute fields included in total extra attribute fields for current inode can be described as below: +--------------------+ | .i_mode | | ... | | .i_ext | +--------------------+ | .i_extra_isize |-----+ | .i_padding | | | .i_prjid | | | .i_atime_extra | | | .i_ctime_extra | | | .i_mtime_extra |<----+ | .i_inode_cs |<----- store blkaddr/inline from here | .i_xattr_cs | | ... | +--------------------+ | | | block address | | | +--------------------+ | .i_nid | +--------------------+ | node_footer | | (nid, ino, offset) | +--------------------+ Hence, with this patch, we would enhance scalability of f2fs inode for storing more newly added attribute. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 15 ++++++--- fs/f2fs/f2fs.h | 72 +++++++++++++++++++++++++++++++---------- fs/f2fs/file.c | 23 ++++++++----- fs/f2fs/gc.c | 2 +- fs/f2fs/inode.c | 32 +++++++++++------- fs/f2fs/namei.c | 5 +++ fs/f2fs/node.c | 7 ++-- fs/f2fs/recovery.c | 7 ++-- fs/f2fs/super.c | 11 +++++-- include/linux/f2fs_fs.h | 13 ++++++-- 10 files changed, 135 insertions(+), 52 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f31c71bcdf0e..376d59929ded 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -459,10 +459,14 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -506,8 +510,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - 
datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -754,7 +758,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; @@ -901,7 +906,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 153a487b1189..845ebcd4217e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -111,9 +111,10 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 -#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -426,10 +427,10 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 - -static inline int get_inline_reserved_size(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * (DEF_ADDRS_PER_INODE -\ - get_inline_reserved_size(inode) -\ +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ F2FS_INLINE_XATTR_ADDRS)) /* for inline dir */ @@ -635,7 +636,7 @@ struct f2fs_inode_info { struct rw_semaphore dio_rwsem[2];/* 
avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; - int i_inline_reserved; /* reserved size in inline data */ + int i_extra_isize; /* size of extra space located in i_addr */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1856,20 +1857,38 @@ static inline bool IS_INODE(struct page *page) return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1960,6 +1979,7 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2079,6 +2099,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + 
set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2095,6 +2117,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -2105,8 +2134,8 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) @@ -2168,9 +2197,9 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - int reserved_size = get_inline_reserved_size(inode); + int extra_size = get_extra_isize(inode); - return (void *)&(ri->i_addr[reserved_size]); + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2281,15 +2310,19 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } -static inline int get_inline_reserved_size(struct inode *inode) +static inline int get_extra_isize(struct inode *inode) { - return F2FS_I(inode)->i_inline_reserved; + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + /* * file.c */ @@ -2882,6 +2915,11 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bc732b17be91..f6a9ae012471 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -385,7 +385,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -470,9 +471,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -912,7 +917,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if 
(test_opt(sbi, LFS)) { @@ -988,8 +994,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1158,7 +1164,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1169,8 +1176,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. 
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8f30dae0fe46..f74685ae008b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -587,7 +587,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 32ec6b23fe01..0a6699a23dfb 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -49,20 +49,22 @@ void f2fs_set_inode_flags(struct inode *inode) static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,16 +73,18 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = 
cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } @@ -153,6 +157,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -292,6 +299,9 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 541d755193c8..f098ae65363b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -72,6 +72,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 46fb5c2693ad..cde5526ec3fa 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -655,7 +655,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -2266,7 +2267,9 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & 
(F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) + dst->i_extra_isize = src->i_extra_isize; new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 907d6b7dde6a..2d9b8182691f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -361,7 +361,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -414,8 +415,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ac719a3ef848..caf6f24ce3a5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -447,8 +447,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - fi->i_inline_reserved = DEF_INLINE_RESERVED_SIZE; - return &fi->vfs_inode; } @@ -1305,9 +1303,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. 
+ */ + /* two direct node blocks */ result += (leaf_count * 2); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index bf27f140c21b..350c6b931fdb 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -186,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -205,6 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -232,8 +235,14 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; From 186801baf7a2bee5fd187a123a533e46ccfc8e2b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 24 Jul 2017 17:12:06 +0800 Subject: [PATCH 379/804] f2fs: record quota during dot{,dot} recovery In ->lookup(), we will have a try to recover dot or dotdot for corrupted directory, once disk quota is on, if it allocates new block during dotdot recovery, we need to record disk quota info for the allocation, so this patch fixes this 
issue by adding missing dquot_initialize() in __recover_dot_dentries. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f098ae65363b..a0bd1c68ec9c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -266,6 +266,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); From fbe3ba58535fe619c191b5f384b6ec84e5e46e61 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Jul 2017 00:01:41 +0800 Subject: [PATCH 380/804] f2fs: support project quota This patch adds support for plain project quota. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 1 + fs/f2fs/f2fs.h | 29 ++++++++++++++++++++++++++++ fs/f2fs/file.c | 13 ------------- fs/f2fs/inode.c | 24 ++++++++++++++++++++++- fs/f2fs/namei.c | 31 ++++++++++++++++++++++++++++++ fs/f2fs/node.c | 7 ++++++- fs/f2fs/super.c | 22 ++++++++++++++++++++- include/linux/f2fs_fs.h | 3 +++ 8 files changed, 114 insertions(+), 16 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 3ba27469a8dd..5cf383f7fa8a 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -162,6 +162,7 @@ io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". usrquota Enable plain user disk quota accounting. grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting.
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 845ebcd4217e..dce0857a72f6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -92,6 +92,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -115,6 +116,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -637,6 +639,7 @@ struct f2fs_inode_info { struct rw_semaphore i_mmap_sem; int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1951,6 +1954,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1980,6 +1997,7 @@ enum { FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void 
__mark_inode_dirty_flag(struct inode *inode, @@ -2323,6 +2341,12 @@ static inline int get_extra_isize(struct inode *inode) (offsetof(struct f2fs_inode, i_extra_end) - \ offsetof(struct f2fs_inode, i_extra_isize)) \ +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + /* * file.c */ @@ -2920,6 +2944,11 @@ static inline int f2fs_sb_has_extra_attr(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); } +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f6a9ae012471..5f4355e9c336 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1503,19 +1503,6 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; -} - static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0a6699a23dfb..f15e663a1a15 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -114,6 +114,7 @@ static int do_read_inode(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -173,6 +174,16 @@ static int do_read_inode(struct 
inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -299,9 +310,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; - if (f2fs_has_extra_attr(inode)) + if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a0bd1c68ec9c..621b164bbe3c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -58,6 +58,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto fail_drop; @@ -90,6 +97,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, 
FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -209,6 +222,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -730,6 +748,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; @@ -918,6 +941,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cde5526ec3fa..62f7bb2227bf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2268,8 +2268,13 @@ retry: dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); - if (dst->i_inline & F2FS_EXTRA_ATTR) + if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index caf6f24ce3a5..e641418751c1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -109,6 +109,7 @@ enum { Opt_nolazytime, 
Opt_usrquota, Opt_grpquota, + Opt_prjquota, Opt_err, }; @@ -146,6 +147,7 @@ static match_table_t f2fs_tokens = { {Opt_nolazytime, "nolazytime"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, {Opt_err, NULL}, }; @@ -392,9 +394,13 @@ static int parse_options(struct super_block *sb, char *options) case Opt_grpquota: set_opt(sbi, GRPQUOTA); break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; #else case Opt_usrquota: case Opt_grpquota: + case Opt_prjquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -815,6 +821,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); #endif return 0; @@ -1173,6 +1181,14 @@ static void f2fs_quota_off_umount(struct super_block *sb) f2fs_quota_off(sb, type); } +#if 0 +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} +#endif + static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, .write_dquot = dquot_commit, @@ -1182,6 +1198,10 @@ static const struct dquot_operations f2fs_quota_operations = { .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, +#if 0 + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +#endif }; static const struct quotactl_ops f2fs_quotactl_ops = { @@ -1967,7 +1987,7 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &f2fs_sops; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 350c6b931fdb..5a6261a7f1ab 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -239,6 +239,7 @@ 
struct f2fs_inode { struct { __le16 i_extra_isize; /* extra inode attribute size */ __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ __le32 i_extra_end[0]; /* for attribute size calculation */ }; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ @@ -522,4 +523,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ From 8af6d9311d8beb57310d2a51d8ed876dc19f0b5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 26 Jul 2017 11:24:13 -0700 Subject: [PATCH 381/804] f2fs: avoid naming confusion of sysfs init This patch changes the function names of sysfs init to follow ext4. f2fs_init_sysfs <-> f2fs_register_sysfs f2fs_exit_sysfs <-> f2fs_unregister_sysfs Suggested-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++---- fs/f2fs/super.c | 12 ++++++------ fs/f2fs/sysfs.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dce0857a72f6..95f366e1f7ae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2904,10 +2904,10 @@ void destroy_extent_cache(void); /* * sysfs.c */ -int __init f2fs_register_sysfs(void); -void f2fs_unregister_sysfs(void); -int f2fs_init_sysfs(struct f2fs_sb_info *sbi); -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* * crypto support diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e641418751c1..24678120969e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -649,7 +649,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->ckpt); - f2fs_exit_sysfs(sbi); + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -2153,7 +2153,7 @@ try_onemore: goto free_root_inode; } - err = f2fs_init_sysfs(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto
free_root_inode; @@ -2224,7 +2224,7 @@ skip_recovery: free_sysfs: f2fs_sync_inode_meta(sbi); - f2fs_exit_sysfs(sbi); + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2342,7 +2342,7 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_register_sysfs(); + err = f2fs_init_sysfs(); if (err) goto free_extent_cache; err = register_shrinker(&f2fs_shrinker_info); @@ -2361,7 +2361,7 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2381,7 +2381,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 71191d89917d..5a78b9af92ef 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -304,7 +304,7 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); -int __init f2fs_register_sysfs(void) +int __init f2fs_init_sysfs(void) { f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); @@ -314,13 +314,13 @@ int __init f2fs_register_sysfs(void) return 0; } -void f2fs_unregister_sysfs(void) +void f2fs_exit_sysfs(void) { kset_unregister(f2fs_kset); remove_proc_entry("fs/f2fs", NULL); } -int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) { struct super_block *sb = sbi->sb; int err; @@ -351,7 +351,7 @@ err_out: return err; } -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); From 4f71d28e090f62689f2b48ff25200120a68e07b6 Mon Sep 17 
00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Jul 2017 02:29:12 -0700 Subject: [PATCH 382/804] f2fs: don't need to wait for node writes for atomic write We have a node chain to serialize node block writes, so if any IOs for node block writes are reordered, we'll get broken node chain. IOWs, roll-forward recovery will see all or none node blocks given fsync mark. E.g., Node chain consists of: N1 -> N2 -> N3 -> NFSYNC -> N1' -> N2' -> N'FSYNC Reordered to: 1) N1 -> N2 -> N3 -> N2' -> NFSYNC -> N'FSYNC -> power-cut 2) N1 -> N2 -> N3 -> N1' -> NFSYNC -> power-cut 3) N1 -> N2 -> NFSYNC -> N1' -> N'FSYNC -> N3 -> power-cut 4) N1 -> NFSYNC -> N1' -> N2' -> N'FSYNC -> N3 -> power-cut Roll-forward recovery can proceed to: 1) N1 -> N2 -> N3 -> NFSYNC -> X 2) N1 -> N2 -> N3 -> NFSYNC -> N1' -> X 3) N1 -> N2 -> N3 -> FSYNC -> N1' -> X 4) N1 -> X Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5f4355e9c336..db3f5023c713 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -277,9 +277,19 @@ sync_nodes: goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. 
+ */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); From 9fcb9eca7376ff83a973e19431bc2596390708eb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Jul 2017 00:32:53 +0800 Subject: [PATCH 383/804] f2fs: introduce f2fs_statfs_project This patch introduces f2fs_statfs_project, which enables showing the usage status of a directory tree that is limited by project quota. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 24678120969e..991448cf762e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -706,6 +706,48 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -741,6 +783,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } From 48ac27052b634bbf3bbefeb9b77c22cd8b1b7388 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 27 Jul 2017 20:11:00 +0800 Subject: [PATCH 384/804] f2fs: provide f2fs_balance_fs to __write_node_page Let node writeback also do f2fs_balance_fs to ensure there are always enough free segments. Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/node.c | 16 ++++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e8ceff42d09b..24976959ef4b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1018,7 +1018,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 95f366e1f7ae..5175e5b1bdfc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2471,7 +2471,8 @@ struct page *get_node_page_ra(struct page *parent, int start); void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); +int sync_node_pages(struct 
f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance); void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 62f7bb2227bf..133afd288b0b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1327,7 +1327,7 @@ continue_unlock: } static int __write_node_page(struct page *page, bool atomic, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, bool do_balance) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1396,6 +1396,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (submitted) *submitted = fio.submitted; + if (do_balance) + f2fs_balance_fs(sbi, false); return 0; redirty_out: @@ -1406,7 +1408,7 @@ redirty_out: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc); + return __write_node_page(page, false, NULL, wbc, false); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1494,7 +1496,7 @@ continue_unlock: ret = __write_node_page(page, atomic && page == last_page, - &submitted, wbc); + &submitted, wbc, true); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1531,7 +1533,8 @@ out: return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance) { pgoff_t index, end; struct pagevec pvec; @@ -1609,7 +1612,8 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - ret = __write_node_page(page, false, &submitted, wbc); + ret = __write_node_page(page, false, &submitted, + wbc, do_balance); if (ret) unlock_page(page); else if (submitted) @@ -1701,7 +1705,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; From 63b0ac86e6dec65e671282ed23319e7a096b4587 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 30 Jul 2017 09:45:14 -0700 Subject: [PATCH 385/804] f2fs: return wrong error number on f2fs_quota_write This must return size, not error number. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 991448cf762e..a8aa498c88eb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1123,7 +1123,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return err; + return 0; inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); From 12832f18b49d43473a9c59d7666887ee1d21d03c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 31 Jul 2017 20:19:09 +0800 Subject: [PATCH 386/804] f2fs: support inode checksum This patch adds to support inode checksum in f2fs. 
Signed-off-by: Chao Yu [Jaegeuk Kim: fix verification flow] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 32 +++++++++++++++++++ fs/f2fs/inode.c | 70 +++++++++++++++++++++++++++++++++++++++++ fs/f2fs/node.c | 7 +++++ fs/f2fs/segment.c | 5 ++- fs/f2fs/super.c | 5 +++ include/linux/f2fs_fs.h | 1 + 6 files changed, 119 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5175e5b1bdfc..fc958df78748 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -117,6 +117,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -1149,6 +1150,9 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; @@ -1237,6 +1241,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -2366,6 +2391,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * inode.c */ void 
f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); @@ -2950,6 +2977,11 @@ static inline int f2fs_sb_has_project_quota(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); } +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f15e663a1a15..b4c401d456e7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -108,6 +108,76 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, 
(__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 133afd288b0b..6c7cc7cdf776 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1171,6 +1171,11 @@ repeat: err = -EIO; goto out_err; } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " @@ -2278,6 +2283,8 @@ retry: F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; + + f2fs_inode_chksum_set(sbi, ipage); } new_ni = old_ni; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3573b95f4fab..af7da1b62e94 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2294,9 +2294,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&sit_i->sentry_lock); - if (page && 
IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + if (add_list) { struct f2fs_bio_info *io; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a8aa498c88eb..dd28d8bed37a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2001,6 +2001,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 5a6261a7f1ab..c2a975e4a711 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -240,6 +240,7 @@ struct f2fs_inode { __le16 i_extra_isize; /* extra inode attribute size */ __le16 i_padding; /* padding */ __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ __le32 i_extra_end[0]; /* for attribute size calculation */ }; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ From 4eaf3d7698b8d5213a7f50e099be72aeea0ae6fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 21 Jul 2017 17:14:09 -0700 Subject: [PATCH 387/804] f2fs: expose features to sysfs entry This patch exposes what features are supported by current f2fs build to sysfs entry via: /sys/fs/f2fs/features/ /sys/fs/f2fs/dev/features Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 156 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 5a78b9af92ef..1e31d0c5b6ab 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -18,7 +18,6 @@ #include "gc.h" static struct proc_dir_entry *f2fs_proc_root; -static struct kset *f2fs_kset; /* Sysfs support for f2fs */ 
enum { @@ -41,6 +40,7 @@ struct f2fs_attr { const char *, size_t); int struct_type; int offset; + int id; }; static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) @@ -76,6 +76,34 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, BD_PART_WRITTEN(sbi))); } +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -155,6 +183,30 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -172,6 +224,13 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); @@ -196,6 +255,18 @@ F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, 
FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -222,21 +293,53 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), ATTR_LIST(reserved_blocks), NULL, }; +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, }; -static struct kobj_type f2fs_ktype = { +static struct kobj_type f2fs_sb_ktype = { .default_attrs = f2fs_attrs, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; @@ -306,18 +409,29 @@ F2FS_PROC_FILE_DEF(segment_bits); int __init f2fs_init_sysfs(void) { - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + int ret; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) - return -ENOMEM; - return 0; + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, 
&f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; } void f2fs_exit_sysfs(void) { - kset_unregister(f2fs_kset); + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; } int f2fs_register_sysfs(struct f2fs_sb_info *sbi) @@ -325,6 +439,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) struct super_block *sb = sbi->sb; int err; + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -334,32 +455,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_bits_fops, sb); } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); - if (err) - goto err_out; return 0; -err_out: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + kobject_del(&sbi->s_kobj); } From 5ca0d2134d724b247e3f5df385b3b28e190cd09a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 2 Aug 2017 20:58:29 -0700 Subject: [PATCH 388/804] f2fs: use printk_ratelimited for f2fs_msg This patch reduces contention of printks. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dd28d8bed37a..d9a6f8132755 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -159,7 +159,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } From f18ec06e50207cb24e29523e7ad75237335d0b9c Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 2 Aug 2017 21:20:13 +0800 Subject: [PATCH 389/804] f2fs: update cur_valid_map_mir together with cur_valid_map When cur_valid_map passes the f2fs_test_and_set(,clear)_bit test, cur_valid_map_mir update is skipped unlikely, so fix it. The fix now changes the mirror check together with cur_valid_map all the time. Signed-off-by: Yunlong Song Signed-off-by: Chao Yu [Jaegeuk Kim: Fix unused variable and add unlikely for corner condition.] 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 48 +++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index af7da1b62e94..1a9737f764d1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1583,6 +1583,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1599,17 +1603,23 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); -#endif } +#endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; @@ -1620,17 +1630,23 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se->ckpt_valid_blocks++; } } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + 
f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); -#endif } +#endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; From 98407fc7a07f5b7d21fc5caaf00844f3889e77a0 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 2 Aug 2017 22:16:54 +0800 Subject: [PATCH 390/804] f2fs: do not change the valid_block value if cur_valid_map was wrongly set or cleared Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1a9737f764d1..09df86430ed0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1618,6 +1618,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_msg(sbi->sb, KERN_ERR, "Bitmap was wrongly set, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; } if (f2fs_discard_en(sbi) && @@ -1645,6 +1647,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_msg(sbi->sb, KERN_ERR, "Bitmap was wrongly cleared, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; } if (f2fs_discard_en(sbi) && From d39f75a593462334d1baf72b67e57bd93e9a1b0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Aug 2017 23:21:48 +0800 Subject: [PATCH 391/804] f2fs: add app/fs io stat This patch enables inner app/fs io stats and introduces below virtual fs nodes for exposing stats info: /sys/fs/f2fs//iostat_enable /proc/fs/f2fs//iostat_info Signed-off-by: Chao Yu [Jaegeuk Kim: fix wrong stat assignment] Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 34 +++++++++++++++++-------- fs/f2fs/data.c | 35 +++++++++++++++++++------- fs/f2fs/f2fs.h | 59 
+++++++++++++++++++++++++++++++++++++++++--- fs/f2fs/file.c | 7 +++++- fs/f2fs/gc.c | 3 +++ fs/f2fs/inline.c | 1 + fs/f2fs/node.c | 15 ++++++----- fs/f2fs/segment.c | 21 ++++++++++++++-- fs/f2fs/super.c | 4 +++ fs/f2fs/sysfs.c | 52 ++++++++++++++++++++++++++++++++++++++ 10 files changed, 200 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 24976959ef4b..2b29d8b836fa 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -231,8 +231,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -245,7 +246,7 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -264,6 +265,12 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -284,7 +291,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -296,7 +303,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum 
iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -347,7 +354,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -905,7 +912,14 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); /* We need to give cpu to another writers. */ if (ino == cur_ino) { @@ -1018,7 +1032,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc, false); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1116,7 +1130,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1195,7 +1209,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1250,7 +1264,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c 
index 376d59929ded..47584eb07ddf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1473,7 +1473,8 @@ out: } static int __write_data_page(struct page *page, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1494,6 +1495,7 @@ static int __write_data_page(struct page *page, bool *submitted, .encrypted_page = NULL, .submitted = false, .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); @@ -1600,7 +1602,7 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, wbc); + return __write_data_page(page, NULL, wbc, FS_DATA_IO); } /* @@ -1609,7 +1611,8 @@ static int f2fs_write_data_page(struct page *page, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1699,7 +1702,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1754,8 +1757,9 @@ continue_unlock: return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1792,7 +1796,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, goto skip_write; blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (wbc->sync_mode == 
WB_SYNC_ALL) @@ -1811,6 +1815,16 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -2076,10 +2090,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc958df78748..976944eb8491 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,6 +621,7 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ @@ -927,6 +928,23 @@ enum need_lock_type { LOCK_RETRY, }; +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + 
FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ @@ -941,6 +959,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; #define is_read_io(rw) ((rw) == READ) @@ -1132,6 +1151,11 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -2372,6 +2396,31 @@ static inline int get_extra_isize(struct inode *inode) sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} + /* * file.c */ @@ -2499,7 +2548,7 @@ void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, - bool do_balance); + bool do_balance, enum iostat_type io_type); void 
build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); @@ -2542,7 +2591,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); int rewrite_data_page(struct f2fs_io_info *fio); @@ -2583,7 +2633,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write); + long nr_to_write, enum iostat_type io_type); void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); @@ -2636,6 +2686,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index db3f5023c713..a606dadcedee 100644 --- a/fs/f2fs/file.c +++ 
b/fs/f2fs/file.c @@ -101,6 +101,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ @@ -1792,7 +1794,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -2473,6 +2475,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f74685ae008b..0cf76a5e3997 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -689,6 +689,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -736,6 +738,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .page = page, .encrypted_page = NULL, .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ed5b1153901e..041072017ef8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -117,6 +117,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6c7cc7cdf776..bc748df0b04f 100644 --- 
a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1332,7 +1332,8 @@ continue_unlock: } static int __write_node_page(struct page *page, bool atomic, bool *submitted, - struct writeback_control *wbc, bool do_balance) + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1345,6 +1346,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .io_type = io_type, }; trace_f2fs_writepage(page, NODE); @@ -1413,7 +1415,7 @@ redirty_out: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc, false); + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1501,7 +1503,8 @@ continue_unlock: ret = __write_node_page(page, atomic && page == last_page, - &submitted, wbc, true); + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1539,7 +1542,7 @@ out: } int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, - bool do_balance) + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1618,7 +1621,7 @@ continue_unlock: set_dentry_mark(page, 0); ret = __write_node_page(page, false, &submitted, - wbc, do_balance); + wbc, do_balance, io_type); if (ret) unlock_page(page); else if (submitted) @@ -1710,7 +1713,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc, true); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 09df86430ed0..edc7c3d254c7 100644 --- a/fs/f2fs/segment.c +++ 
b/fs/f2fs/segment.c @@ -292,6 +292,7 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -903,6 +904,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, submit_bio(REQ_SYNC, bio); list_move_tail(&dc->list, &dcc->wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); } } else { __remove_discard_cmd(sbi, dc); @@ -2351,7 +2354,8 @@ reallocate: } } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -2370,6 +2374,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) set_page_writeback(page); f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2378,6 +2384,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -2391,13 +2399,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - return f2fs_submit_page_bio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, diff --git a/fs/f2fs/super.c 
b/fs/f2fs/super.c index d9a6f8132755..318df0660b74 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2064,6 +2064,10 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1: NR_TEMP_TYPE; int j; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1e31d0c5b6ab..3d6bbdb743b0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -153,6 +153,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, return count; } *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + return count; } @@ -250,6 +254,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -288,6 +293,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), @@ -391,6 +397,48 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + 
sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + #define F2FS_PROC_FILE_DEF(_name) \ static int _name##_open_fs(struct inode *inode, struct file *file) \ { \ @@ -406,6 +454,7 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); int __init f2fs_init_sysfs(void) { @@ -454,6 +503,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) &f2fs_seq_segment_info_fops, sb); proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); } return 0; } @@ -461,6 +512,7 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); From c9881425b5b16c4dd9656d6ae0b95029157ccc50 Mon Sep 17 00:00:00 2001 From: Yunlong Song 
Date: Fri, 4 Aug 2017 17:07:15 +0800 Subject: [PATCH 392/804] f2fs: fix the size value in __check_sit_bitmap The current size value is not correct and will miss some bitmap checks. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index edc7c3d254c7..20f466ace8b0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -868,11 +868,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); - size = min((unsigned long)(end - blk), max_blocks); + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); - blk += size; + blk = START_BLOCK(sbi, segno + 1); } #endif } From 2d982d49c3205e47fc8bf92ee98c4a4e90e67cfd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 5 Aug 2017 14:25:08 -0700 Subject: [PATCH 393/804] f2fs: use IPU for cold files We expect cold files to be written sequentially, but sometimes small pieces of their data can be updated, which incurs fragmentation. Let's avoid that.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9ba1f1d9723..84242eb5226f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -577,6 +577,10 @@ static inline bool need_inplace_update_policy(struct inode *inode, if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) From bdf6e5ea9299f0893e3b304316941ae40a7a3897 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 6 Aug 2017 22:09:00 -0700 Subject: [PATCH 394/804] f2fs: introduce gc_urgent mode for background GC This patch adds a sysfs entry to control urgent mode for background GC. If this is set, background GC thread conducts GC with gc_urgent_sleep_time all the time. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 12 ++++++++++++ Documentation/filesystems/f2fs.txt | 9 +++++++++ fs/f2fs/gc.c | 17 +++++++++++++++-- fs/f2fs/gc.h | 4 ++++ fs/f2fs/sysfs.c | 9 +++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2805ce062fdb..6c2c50b4e781 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -124,3 +124,15 @@ Date: June 2017 Contact: "Chao Yu" Description: Controls current reserved blocks in system. 
+ +What: /sys/fs/f2fs/<disk>/gc_urgent +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Do background GC aggressively + +What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Controls sleep time of GC urgent mode diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 5cf383f7fa8a..8a3f991098ad 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -208,6 +208,15 @@ Files in /sys/fs/f2fs/ gc_idle = 1 will select the Cost Benefit approach & setting gc_idle = 2 will select the greedy approach. + gc_urgent This parameter controls triggering background GCs + urgently or not. Setting gc_urgent = 0 [default] + reverts to the default behavior, while if it is set + to 1, background thread starts to do GC by given + gc_urgent_sleep_time interval. + + gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. + 500 ms is set by default. See above gc_urgent. + reclaim_segments This parameter controls the number of prefree segments to be reclaimed.
If the number of prefree segments is larger than the number of segments diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0cf76a5e3997..3c05eea382b9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,9 +35,14 @@ static int gc_thread_func(void *data) set_freezable(); do { wait_event_interruptible_timeout(*wq, - kthread_should_stop() || freezing(current), + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; if (kthread_should_stop()) @@ -74,6 +79,11 @@ static int gc_thread_func(void *data) if (!mutex_trylock(&sbi->gc_mutex)) goto next; + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } + if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); @@ -84,7 +94,7 @@ static int gc_thread_func(void *data) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ @@ -115,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..57a9000ce3af 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { 
wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3d6bbdb743b0..c40e5d24df9f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -156,6 +156,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + } return count; } @@ -235,10 +239,13 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .id = _id, \ } +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -275,10 +282,12 @@ F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), ATTR_LIST(gc_min_sleep_time), ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), ATTR_LIST(batched_trim_sections), From 
3f42e75b2df897eeaac4f5c8c38d2c072499753a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Aug 2017 16:37:59 +0800 Subject: [PATCH 395/804] f2fs: avoid unneeded sync on quota file We only need to sync quota file with appointed quota type instead of all types in f2fs_quota_{on,off}. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 318df0660b74..801ab4ceeb36 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1177,7 +1177,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, struct inode *inode; int err; - err = f2fs_quota_sync(sb, -1); + err = f2fs_quota_sync(sb, type); if (err) return err; @@ -1205,7 +1205,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - f2fs_quota_sync(sb, -1); + f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); if (err) From 3ba499e5fa949b7711d45b8957c670029adf51c5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Aug 2017 23:12:46 +0800 Subject: [PATCH 396/804] f2fs: fix potential overflow when adjusting GC cycle When comparing signed and unsigned variables, the compiler converts the signed value to an unsigned one; for this reason, {in,de}crease_sleep_time may return an overflowed result.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- fs/f2fs/gc.h | 23 +++++++++++++++-------- include/trace/events/f2fs.h | 6 +++--- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3c05eea382b9..faed28e56203 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,7 +28,7 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 57a9000ce3af..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -69,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 20c4556ab56d..167c40850f98 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -535,14 
+535,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), @@ -554,7 +554,7 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, From 9e6ece8a411241dd247f1c8afa0ca5860d1ba1d7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Aug 2017 10:54:31 +0800 Subject: [PATCH 397/804] f2fs: support journalled quota This patch enables f2fs to accept quota information through mount options: - {usr,grp,prj}jquota= - jqfmt= Then, in the ->mount flow, we can recover the quota file during log replaying; this way, journalled quota can be supported. Signed-off-by: Chao Yu [Jaegeuk Kim: Fix wrong return values.] Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 9 + fs/f2fs/checkpoint.c | 26 ++- fs/f2fs/f2fs.h | 9 + fs/f2fs/recovery.c | 72 ++++++- fs/f2fs/super.c | 326 +++++++++++++++++++++++++++-- 5 files changed, 412 insertions(+), 30 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 8a3f991098ad..6cf9ad12c57f 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -163,6 +163,15 @@ io_bits=%u Set the bit size of write IO requests. It should be set usrquota Enable plain user disk quota accounting. grpquota Enable plain group disk quota accounting. prjquota Enable plain project quota accounting.
+usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journelled quota. +offgrpjquota Turn off group journelled quota. +offprjjquota Turn off project journelled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2b29d8b836fa..e86f67ac96c6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -589,11 +589,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -609,14 +622,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info 
*sbi, block_t start_blk) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 976944eb8491..310d8588ad3c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -93,6 +93,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -1181,6 +1182,12 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -2510,6 +2517,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2d9b8182691f..a3d02613934a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = 
f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -226,18 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { + bool quota_inode = false; + if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
*/ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -328,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -558,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -574,11 +623,11 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -587,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ @@ -615,5 +664,12 @@ out: 
} kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 801ab4ceeb36..4a5eae7ec64c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -107,9 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -145,9 +157,20 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; @@ -170,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if 
(!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -177,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -388,6 +512,7 @@ static int parse_options(struct super_block *sb, char *options) sb->s_flags &= ~MS_LAZYTIME; break; #ifdef CONFIG_QUOTA + case Opt_quota: case Opt_usrquota: set_opt(sbi, USRQUOTA); break; @@ -397,10 +522,66 @@ static int parse_options(struct super_block *sb, char *options) case Opt_prjquota: set_opt(sbi, PRJQUOTA); break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, 
PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; #else + case Opt_quota: case Opt_usrquota: case Opt_grpquota: case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -412,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -591,7 +776,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) kfree(sbi->devs); } -static void f2fs_quota_off_umount(struct super_block *sb); static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -659,6 +843,10 @@ static void f2fs_put_super(struct super_block *sb) destroy_device_list(sbi); if (sbi->write_io_dummy) mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); @@ -672,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -792,6 +983,40 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, 
+ struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -865,6 +1090,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) sbi->fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); if (test_opt(sbi, USRQUOTA)) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) @@ -872,6 +1099,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, PRJQUOTA)) seq_puts(seq, ",prjquota"); #endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } @@ -920,6 +1148,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we @@ -929,6 +1162,23 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 
0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -1010,6 +1260,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -1024,6 +1279,13 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; @@ -1140,6 +1402,27 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) return &F2FS_I(inode)->i_reserved_quota; } +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + static int f2fs_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); @@ -1221,7 +1504,7 @@ out_put: return err; } -static void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { int type; @@ -1262,7 +1545,7 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .set_dqblk = dquot_set_dqblk, }; #else -static inline void 
f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { } #endif @@ -2186,11 +2469,6 @@ try_onemore: if (err) goto free_nm; - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { @@ -2214,6 +2492,11 @@ try_onemore: if (err) goto free_root_inode; + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); + if (err) + goto free_sysfs; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -2223,7 +2506,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_sysfs; + goto free_meta; } if (need_fsck) @@ -2237,7 +2520,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_sysfs; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2261,7 +2544,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_sysfs; + goto free_meta; } kfree(options); @@ -2279,8 +2562,16 @@ skip_recovery: f2fs_update_time(sbi, REQ_TIME); return 0; -free_sysfs: +free_meta: f2fs_sync_inode_meta(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); @@ -2290,13 +2581,6 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. 
Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). - */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); f2fs_destroy_stats(sbi); @@ -2316,6 +2600,10 @@ free_options: for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); From 99dae6bc11ad5687760004133f26ad3e14d86a74 Mon Sep 17 00:00:00 2001 From: Qiuyang Sun Date: Wed, 9 Aug 2017 17:27:30 +0800 Subject: [PATCH 398/804] f2fs: merge equivalent flags F2FS_GET_BLOCK_[READ|DIO] Currently, the two flags F2FS_GET_BLOCK_[READ|DIO] are totally equivalent and can be used interchangeably in all scenarios they are involved in. Neither of the flags is referenced in f2fs_map_blocks(), making them both the default case. To remove the ambiguity, this patch merges both flags into F2FS_GET_BLOCK_DEFAULT, and introduces an enum for all distinct flags.
Signed-off-by: Qiuyang Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 13 +++++++------ fs/f2fs/file.c | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 47584eb07ddf..d7aa2e908570 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1043,7 +1043,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1242,7 +1242,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 310d8588ad3c..1aac76dd1938 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -574,12 +574,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a606dadcedee..30dc356d922c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2051,7 +2051,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -2093,7 +2093,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; From 2b476db7b17c8525f0c5b9b42b6225118b38dd0e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Aug 2017 17:35:04 -0700 Subject: [PATCH 399/804] f2fs: let fill_super handle roll-forward errors If we set CP_ERROR_FLAG on a roll-forward error, f2fs can no longer proceed with any I/O due to f2fs_cp_error(). But, for example, if some stale data is involved in the roll-forward process, we may get -ENOENT, leaving the fs stuck. If we get any error, let fill_super set SBI_NEED_FSCK and try to recover back to a stable point. Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a3d02613934a..f707d810c87d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -649,8 +649,6 @@ skip: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ From f542a0378dc8b18ffb09bcdbf23aa55c260d6acc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Aug 2017 19:09:08 +0800 Subject: [PATCH 400/804] f2fs: retry to revoke atomic commit in -ENOMEM case During atomic committing, if we encounter -ENOMEM in revoke path, it's better to give a chance to retry revoking.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 20f466ace8b0..03849778b881 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -213,9 +213,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } From 6ae3dde9ed3cba9eba3d5e95ba8dba1635134d91 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 11 Aug 2017 18:00:15 +0800 Subject: [PATCH 401/804] f2fs: add tracepoint for f2fs_gc This patch adds tracepoint for f2fs_gc. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 50 ++++++++++++----- include/trace/events/f2fs.h | 107 ++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index faed28e56203..ccb00de9b0b0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -919,7 +919,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -965,6 +965,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } @@ -975,21 +979,17 @@ next: blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, true) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -997,6 +997,15 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { @@ -1023,17 +1032,20 @@ gc_more: gc_type = FG_GC; } - ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - if (!__get_victim(sbi, &segno, gc_type)) + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; - ret = 0; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; @@ -1050,6 +1062,16 @@ gc_more: stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 167c40850f98..7063bbcca03b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -561,6 +561,113 @@ TRACE_EVENT(f2fs_background_gc, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + 
__entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, 
dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, From 8a8cce5b1f1705f757747ea558985e5eb2a7c69d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 12 Aug 2017 21:33:23 -0700 Subject: [PATCH 402/804] f2fs: check hot_data for roll-forward recovery We need to check HOT_DATA to truncate any previous data block when doing roll-forward recovery. Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f707d810c87d..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -317,7 +317,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; From a50bb55fd335f59297c2b2a9f11f5fd209ef0a77 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 14 Aug 2017 16:52:43 +0800 Subject: [PATCH 403/804] f2fs: remove unused function overprovision_sections Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 84242eb5226f..a843751b253b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -492,11 +492,6 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int 
overprovision_sections(struct f2fs_sb_info *sbi) -{ - return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); From 4ff6d9bf5af4c74a6a69e32a630975c92f9614d1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Aug 2017 23:09:56 +0800 Subject: [PATCH 404/804] f2fs: introduce discard_granularity sysfs entry Commit d618ebaf0aa8 ("f2fs: enable small discard by default") enables f2fs to issue 4K size discard in real-time discard mode. However, issuing smaller discard may cost more lifetime but releasing less free space in flash device. Since f2fs has ability of separating hot/cold data and garbage collection, we can expect that small-sized invalid region would expand soon with OPU, deletion or garbage collection on valid data, so it's better to delay or skip issuing smaller size discards, it could help to reduce overmuch consumption of IO bandwidth and lifetime of flash storage. This patch makes f2fs selecting 64K size as its default minimal granularity, and issue discard with the size which is not smaller than minimal granularity. Also it exposes discard granularity as sysfs entry for configuration in different scenario. Jaegeuk Kim: We must issue all the accumulated discard commands when fstrim is called. So, I've added pend_list_tag[] to indicate whether we should issue the commands or not. If tag sets P_ACTIVE or P_TRIM, we have to issue them. P_TRIM is set once at a time, given fstrim trigger. In addition, issue_discard_thread is calling too much due to the number of discard commands remaining in the pending list. I added a timer to control it likewise gc_thread.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 +++ fs/f2fs/f2fs.h | 12 ++++ fs/f2fs/segment.c | 91 +++++++++++++++++++++---- fs/f2fs/sysfs.c | 23 +++++++ 4 files changed, 121 insertions(+), 14 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 6c2c50b4e781..500c60403653 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" Description: Controls the issue rate of small discard commands. +What: /sys/fs/f2fs//discard_granularity +Date: July 2017 +Contact: "Chao Yu" +Description: + Controls discard granularity of inner discard thread, inner thread + will not issue discards with size that is smaller than granularity. + The unit size is one block, now only support configuring in range + of [1, 512]. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1aac76dd1938..e13daceb7995 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -215,6 +215,8 @@ enum { (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -263,11 +265,18 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? 
\ (MAX_PLIST_NUM - 1) : (blk_num - 1)) +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + enum { D_PREP, D_SUBMIT, @@ -303,11 +312,14 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 03849778b881..97d43373e10e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1096,32 +1096,65 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0; + int iter = 0, issued = 0; + int i; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if 
(!issue_cond || is_idle(sbi)) + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { __submit_discard_cmd(sbi, dc); + issued++; + continue; + } + + if (!issue_cond || is_idle(sbi)) { + issued++; + __submit_discard_cmd(sbi, dc); + } if (issue_cond && iter++ > DISCARD_ISSUE_RATE) goto out; } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } out: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); } static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, @@ -1206,34 +1239,56 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); __wait_discard_cmd(sbi, false); } +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; set_freezable(); do { - wait_event_interruptible(*q, kthread_should_stop() || - freezing(current) || - atomic_read(&dcc->discard_cmd_cnt)); + wait_event_interruptible_timeout(*q, + kthread_should_stop() || 
freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; if (kthread_should_stop()) return 0; + if (dcc->discard_wake) + dcc->discard_wake = 0; + sb_start_intwrite(sbi->sb); - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } sb_end_intwrite(sbi->sb); - congestion_wait(BLK_RW_SYNC, HZ/50); } while (!kthread_should_stop()); return 0; } @@ -1424,7 +1479,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; @@ -1506,11 +1562,12 @@ skip: goto find_next; list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= total_len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) @@ -1528,9 +1585,13 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) + for (i = 0; i < MAX_PLIST_NUM; i++) { INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); @@ -2207,6 +2268,8 @@ int 
f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c40e5d24df9f..4bcaa9059026 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -152,6 +152,27 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, spin_unlock(&sbi->stat_lock); return count; } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + return count; + } + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) @@ -248,6 +269,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); @@ -290,6 +312,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), From 8f8b9cda392501633678aa75ed8830d31e8e79f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 15 Aug 2017 21:27:19 -0700 Subject: [PATCH 405/804] f2fs: 
issue discard commands if gc_urgent is set It's time to issue all the discard commands, if user sets the idle time. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++++- fs/f2fs/sysfs.c | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 97d43373e10e..abfa55174d0c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -21,6 +21,7 @@ #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include @@ -1274,8 +1275,11 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; - if (dcc->discard_wake) + if (dcc->discard_wake) { dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } sb_start_intwrite(sbi->sb); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 4bcaa9059026..b9ad9041559f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -178,8 +178,13 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); } return count; From 440c08fb62d22b49bcb846b1805a20441d297172 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Aug 2017 22:53:45 +0800 Subject: [PATCH 406/804] f2fs: fix out-of-order execution in f2fs_issue_flush In f2fs_issue_flush, due to out-of-order execution of CPU, wake_up can be called before we insert issue_list, result in long latency of wait_for_completion. Fix this by adding smp_mb() to force the order of related codes. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index abfa55174d0c..e9416ae025aa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -549,7 +549,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { From 9071bb1c094f70ab08253b9335e62b8a79e20b15 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Aug 2017 23:37:36 +0800 Subject: [PATCH 407/804] f2fs: clear FI_HOT_DATA correctly This patch fixes to clear FI_HOT_DATA correctly in below path: - error handling in f2fs_ioc_start_atomic_write - after commit atomic write in f2fs_ioc_commit_atomic_write - after drop atomic write in drop_inmem_pages Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ fs/f2fs/segment.c | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 30dc356d922c..25087401b2e6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1617,6 +1617,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); goto out; } @@ -1655,6 +1656,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e9416ae025aa..78a0e8ee62b8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -255,6 +255,7 @@ void drop_inmem_pages(struct inode 
*inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } From e1a34a55188890b7addd174aa83aee14840c8c9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Aug 2017 16:20:33 +0800 Subject: [PATCH 408/804] f2fs: trigger normal fsync for non-atomic_write file If file was not opened with atomic write mode, but user uses atomic write ioctl to fsync datas, in the flow, we should not fsync that file with atomic write mode. Fixes: 608514deba38 ("f2fs: set fsync mark only for the last dnode") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 25087401b2e6..1c3dffc987b1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1660,7 +1660,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, false); } err_out: inode_unlock(inode); From 5469cedba6796ebb2e1dceca9a7c3b605e215cd6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 21 Aug 2017 13:51:32 -0700 Subject: [PATCH 409/804] f2fs: return error when accessing insane flie offset If file offset is insane, we have to return error instead of kernel panic. 
Reported-by: Eric Zhang Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc748df0b04f..a825a973dcf2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -555,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -579,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -878,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { From 1e5c4e7c8dc42ce706d09ace943ab29fbe6aa6ac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Aug 2017 21:15:43 -0700 Subject: [PATCH 410/804] f2fs: wake up discard_thread iff there is a candidate This patch fixes to avoid needless wake ups. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +-- fs/f2fs/segment.h | 25 +++++++++++++++++++++++++ fs/f2fs/sysfs.c | 6 +----- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 78a0e8ee62b8..00253111c227 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1574,8 +1574,7 @@ skip: kmem_cache_free(discard_entry_slab, entry); } - dcc->discard_wake = 1; - wake_up_interruptible_all(&dcc->discard_wait_queue); + wake_up_discard_thread(sbi, false); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a843751b253b..b8aa84109bf5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -797,3 +797,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b9ad9041559f..962735dc9c63 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -178,13 +178,9 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); - - dcc->discard_wake = 1; - 
wake_up_interruptible_all(&dcc->discard_wait_queue); + wake_up_discard_thread(sbi, true); } return count; From 0520ca37ef89ae0ce3679da7616b7a66a0cfa774 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Aug 2017 18:23:24 +0800 Subject: [PATCH 411/804] f2fs: fix to avoid race in between aio and gc We won't wait DIO synchronously when doing AIO, so there will be potential IO reorder in between AIO and GC, which will cause data corruption. This patch adds inode_dio_wait to serialize aio and data GC to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ccb00de9b0b0..382b7d386ffb 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -875,6 +875,9 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) From 077e22bf7a87013157dca27ea9e7ff3adcd75385 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Aug 2017 18:23:25 +0800 Subject: [PATCH 412/804] f2fs: trigger fdatasync for non-atomic_write file Sqlite only cares about synchronization of file data instead of other data unrelated attribute of inode, so in commit flow, call fdatasync is enough. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1c3dffc987b1..6c2ebe91afeb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1660,7 +1660,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); From c953aed665079b3f3878497e0dcaf763a5645f50 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 31 Aug 2017 11:10:58 -0700 Subject: [PATCH 413/804] f2fs: don't need to update inode checksum for recovery This patch fixes "f2fs: support inode checksum". The recovered inode page will be rewritten with valid checksum. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a825a973dcf2..d789cff5ffb1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2290,8 +2290,6 @@ retry: F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; - - f2fs_inode_chksum_set(sbi, ipage); } new_ni = old_ni; From 85825456837e849a5cf8d6de78edf8771fe44b98 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 31 Aug 2017 16:54:51 -0700 Subject: [PATCH 414/804] f2fs: don't check inode's checksum if it was dirtied or writebacked If another thread already made the page dirtied or writebacked, we must avoid to verify checksum. If we got an error, we need to remove its uptodate as well. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 ++- fs/f2fs/node.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b4c401d456e7..c33b05aec1a1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -153,7 +153,8 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_inode *ri; __u32 provided, calculated; - if (!f2fs_enable_inode_chksum(sbi, page)) + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) return true; ri = &F2FS_NODE(page)->i; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d789cff5ffb1..32474db18ad9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1187,9 +1187,9 @@ page_hit: nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - ClearPageUptodate(page); err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); return ERR_PTR(err); } From 6337ccbeee428f9859925f90fe935093d0c692b7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Aug 2017 18:04:47 +0800 Subject: [PATCH 415/804] f2fs: update i_flags correctly f2fs enables hash-indexed directory by default, so we need to tag FS_INDEX_FL in inode::i_flags during directory creation, in order to show correct status of inode in lsattr: Before: ------------------- /mnt/f2fs/dir/ After: -----------I------- /mnt/f2fs/dir/ Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 621b164bbe3c..d92b8e9064cb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -100,6 +100,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) set_inode_flag(inode,
FI_PROJ_INHERIT); From bc0c8fe8b11e30ac7d881638ed422a7473827bb7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Aug 2017 18:04:48 +0800 Subject: [PATCH 416/804] f2fs: remove unneeded parameter of change_curseg allocate_segment_by_default is the only caller of change_curseg passing @reuse with 'false', but commit 763bfe1bc575 ("f2fs: remove reusing any prefree segments") removes the calling, after that, @reuse in change_curseg always be true, so, let's clean up the unneeded parameter. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 00253111c227..a44c6fd2f1c5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2092,7 +2092,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2113,12 +2113,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) @@ -2182,7 +2180,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, else if 
(curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2535,7 +2533,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -2554,7 +2552,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } From 3b8bbd990ce57815f0b0b4b60029ba8d173f5912 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Aug 2017 18:56:05 +0800 Subject: [PATCH 417/804] f2fs: avoid race in between atomic_read & atomic_inc Previously, we will miss merging flush command during fsync due to below race condition: Thread A Thread B Thread C - f2fs_issue_flush - atomic_read(&issing_flush) - f2fs_issue_flush - atomic_read(&issing_flush) - f2fs_issue_flush - atomic_read(&issing_flush) - atomic_inc(&issing_flush) - atomic_inc(&issing_flush) - atomic_inc(&issing_flush) - submit_flush_wait - submit_flush_wait - submit_flush_wait It needs to use atomic_inc_return instead to avoid such race. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a44c6fd2f1c5..370b4ca0e294 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -536,8 +536,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return ret; } - if (!atomic_read(&fcc->issing_flush)) { - atomic_inc(&fcc->issing_flush); + if (atomic_inc_return(&fcc->issing_flush) == 1) { ret = submit_flush_wait(sbi); atomic_dec(&fcc->issing_flush); @@ -547,7 +546,6 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) init_completion(&cmd.wait); - atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); /* update issue_list before we wake up issue_flush thread */ From ccb0b5d09d8c46c03e91c8e7a62b57fedee0662e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Aug 2017 18:56:06 +0800 Subject: [PATCH 418/804] f2fs: fix to wake up all sleeping flusher In scenario of remount_ro vs flush, after flush_thread exits in ->remount_fs, flusher will only clean up global issue_list, but without waking up flushers waiting on that list, resulting in related user threads hanging. In order to fix this issue, this patch enables the flusher to take charge of issue_flush thread: executes merged flush command, and wake up all sleeping flushers.
Fixes: 5eba8c5d1fb3 ("f2fs: fix to access nullified flush_cmd_control pointer") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 370b4ca0e294..9d8d32b38073 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -558,8 +558,27 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) wait_for_completion(&cmd.wait); atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->issing_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; From f24eafa643946942a200e14db46115b7082ce5bf Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 31 Aug 2017 15:06:24 +0530 Subject: [PATCH 419/804] f2fs: constify super_operations super_operations are not supposed to change at runtime. "struct super_block" working with super_operations provided by work with const super_operations. 
So mark the non-const structs as const Signed-off-by: Arvind Yadav Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4a5eae7ec64c..731794142009 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1550,7 +1550,7 @@ void f2fs_quota_off_umount(struct super_block *sb) } #endif -static struct super_operations f2fs_sops = { +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, From 9f467e94d08d4b0d674436bc637fdbdfb76490dd Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 4 Sep 2017 11:10:18 +0800 Subject: [PATCH 420/804] Revert "f2fs: add a new function get_ssr_cost" This reverts commit b7b7c4cf1c9ef0272a65f1480457cbfdadcda19d. se->ckpt_valid_blocks will never be smaller than se->valid_blocks, so just remove get_ssr_cost. Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 382b7d386ffb..427f53489591 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -277,20 +277,11 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } -static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - struct seg_entry *se = get_seg_entry(sbi, segno); - - return se->ckpt_valid_blocks > se->valid_blocks ? 
- se->ckpt_valid_blocks : se->valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_ssr_cost(sbi, segno); + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) From 4445c7cfbdcc36f07598294b9585f545595e7051 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 5 Sep 2017 16:54:24 -0700 Subject: [PATCH 421/804] f2fs: introduce f2fs_encrypted_file for clean-up This patch replaces (f2fs_encrypted_inode() && S_ISREG()) with f2fs_encrypted_file(), which gives no functional change. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 +++++----- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 5 ++--- fs/f2fs/inline.c | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d7aa2e908570..f850060ff2e5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -580,7 +580,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, .encrypted_page = NULL, }; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); @@ -785,7 +785,7 @@ alloc: static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -1156,7 +1156,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, struct fscrypt_ctx *ctx = NULL; struct bio *bio; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) return ERR_CAST(ctx); @@ -1343,7 +1343,7 @@ static int encrypt_one_page(struct 
f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; gfp_t gfp_flags = GFP_NOFS; - if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + if (!f2fs_encrypted_file(inode)) return 0; /* wait for GCed encrypted page writeback */ @@ -1971,7 +1971,7 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e13daceb7995..7b9246197b23 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3020,6 +3020,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6c2ebe91afeb..2632d447c996 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -109,7 +109,7 @@ mapped: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); out_sem: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 427f53489591..d36130233d9e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -831,8 +831,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -873,7 +872,7 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) 
move_encrypted_block(inode, start_bidx, segno, off); else move_data_page(inode, start_bidx, gc_type, segno, off); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 041072017ef8..92b5a4a89ed2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -25,7 +25,7 @@ bool f2fs_may_inline_data(struct inode *inode) if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; From e2cd416ffa3262e4cffb03aec48f6ad15f996f95 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 5 Sep 2017 17:04:35 -0700 Subject: [PATCH 422/804] f2fs: use generic terms used for encrypted block management This patch renames functions regarding to buffer management via META_MAPPING used for encrypted blocks especially. We can actually use them in generic way. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 13 +++++++++---- fs/f2fs/segment.c | 3 +-- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f850060ff2e5..4d79696c3429 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1162,7 +1162,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, return ERR_CAST(ctx); /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + f2fs_wait_on_block_writeback(sbi, blkaddr); } bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); @@ -1347,7 +1347,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) return 0; /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@ -1972,7 +1972,7 @@ repeat: /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_file(inode)) - 
f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7b9246197b23..04ab25448c51 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2631,8 +2631,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2632d447c996..531379f513fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -110,7 +110,7 @@ mapped: /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_file(inode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d36130233d9e..bd16e6631cf3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -599,8 +599,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. 
+ */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -873,9 +877,10 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_file(inode)) - move_encrypted_block(inode, start_bidx, segno, off); + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9d8d32b38073..e95470071030 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2608,8 +2608,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; From fc9c6007a268f2c48643ce32efef6862433580ff Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 6 Sep 2017 21:04:44 -0700 Subject: [PATCH 423/804] f2fs: make get_lock_data_page to handle encrypted inode This patch refactors get_lock_data_page() to handle encryption case directly. In order to do that, it introduces common f2fs_submit_page_read(). 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 109 +++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 58 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4d79696c3429..a275cbe57042 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -455,6 +455,53 @@ out_fail: return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); @@ -572,16 +619,6 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct page *page; struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_file(inode)) - return read_mapping_page(mapping, index, NULL); page = 
f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -622,9 +659,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -1149,35 +1184,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_file(inode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_block_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
@@ -1273,12 +1279,11 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1986,21 +1991,9 @@ repeat: zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - bio->bi_rw = READ_SYNC; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; - goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { From b70c5bf429edad6e1856c20a8d0a309df72af544 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 7 Sep 2017 10:40:54 +0800 Subject: [PATCH 424/804] f2fs: avoid race in between read xattr & write xattr Thread A: Thread B: -f2fs_getxattr -lookup_all_xattrs -xnid = F2FS_I(inode)->i_xattr_nid; -f2fs_setxattr -__f2fs_setxattr -write_all_xattrs -truncate_xattr_node ... ... -write_checkpoint ... ... 
-alloc_nid <- nid reuse -get_node_page -f2fs_bug_on <- nid != node_footer->nid It needs a rw_sem to avoid the race Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + fs/f2fs/xattr.c | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04ab25448c51..9d96f6d51eef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -653,6 +653,7 @@ struct f2fs_inode_info { struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 731794142009..315e59ad1483 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -630,6 +630,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); #ifdef CONFIG_QUOTA memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aad59c7c3a63..ab658419552b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -520,8 +520,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; + down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -550,7 +552,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -726,7
+730,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); From ef75b9afda215c1446195bf487987af43a458959 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 9 Sep 2017 12:03:23 -0700 Subject: [PATCH 425/804] f2fs: better to wait for fstrim completion In android, we'd better wait for fstrim completion instead of issuing the discard commands asynchronous. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e95470071030..8ee473b1830f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -1141,6 +1142,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) if (dcc->pend_list_tag[i] & P_TRIM) { __submit_discard_cmd(sbi, dc); issued++; + + if (fatal_signal_pending(current)) + break; continue; } @@ -1257,7 +1261,7 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super */ +/* This comes from f2fs_put_super and f2fs_trim_fs */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); @@ -2292,6 +2296,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; From 29f775fa640d4df8d5a1a0dad0b1c44143a00007 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 9 Sep 2017 11:11:04 -0700 Subject: [PATCH 426/804] f2fs: speed up gc_urgent mode with SSR This patch activates SSR 
in gc_urgent mode. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 15 +++++++++++++++ fs/f2fs/segment.h | 13 ------------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9d96f6d51eef..ff694127243a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2592,6 +2592,7 @@ void destroy_node_manager_caches(void); /* * segment.c */ +bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8ee473b1830f..3244cfb1885f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -169,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b8aa84109bf5..ffa11274b0ce 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -497,19 +497,6 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - 
return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); -} - static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { From 95b475cd685686460f04e6862cdc400ff021f843 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 11 Sep 2017 16:30:28 +0900 Subject: [PATCH 427/804] f2fs: clear radix tree dirty tag of pages whose dirty flag is cleared In a scenario like writing out the first dirty page of the inode as the inline data, we only cleared dirty flags of the pages, but didn't clear the dirty tags of those pages in the radix tree. If we don't clear the dirty tags of the pages in the radix tree, the inodes which contain the pages will be marked with I_DIRTY_PAGES again and again, and writepages() for the inodes will be invoked in every writeback period. As a result, nothing will be done in every writepages() for the inodes and it will just consume CPU time meaninglessly. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 7 +++++++ fs/f2fs/inline.c | 7 +++++++ mm/util.c | 1 + 3 files changed, 15 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1380c442648b..4f2a8fedb313 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -705,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -735,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); +
clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 92b5a4a89ed2..7e76c415b913 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -202,6 +202,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -223,6 +225,11 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/mm/util.c b/mm/util.c index d5259b62f8d7..d7b1065644be 100644 --- a/mm/util.c +++ b/mm/util.c @@ -348,6 +348,7 @@ struct address_space *page_mapping(struct page *page) return NULL; return page->mapping; } +EXPORT_SYMBOL(page_mapping); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, From 11dcf7834966656d4337dd6da77b04f6f765a893 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Sep 2017 14:04:05 +0800 Subject: [PATCH 428/804] f2fs: detect dirty inode in evict_inode Add a bugon in f2fs_evict_inode to detect inconsistent status between inode cache and related node page cache. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c33b05aec1a1..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -519,6 +519,9 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, From 603dde39653d6dadd170329feb8febe2ac19cde5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Sep 2017 14:25:35 +0800 Subject: [PATCH 429/804] f2fs: fix to show correct discard_granularity in sysfs Fix below incorrect display when reading discard_granularity sysfs node. $ cat /sys/fs/f2fs//discard_granularity $ 16 $ echo 32 > /sys/fs/f2fs//discard_granularity $ cat /sys/fs/f2fs//discard_granularity $ 16 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 962735dc9c63..e2c258f717cd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -170,6 +170,8 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, dcc->pend_list_tag[i] &= (~P_ACTIVE); } mutex_unlock(&dcc->cmd_lock); + + *ui = t; return count; } From c7fd9e2b4a687666fbf12b73e443134580976606 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Sep 2017 21:35:12 +0800 Subject: [PATCH 430/804] f2fs: hurry up to issue discard after io interruption Once we encounter I/O interruption during issuing discards, we will delay a long time before next round, but if system status is I/O idle during the time, it may lose the opportunity to issue discards. So this patch changes to hurry up to issue discard after io interruption. Besides, this patch also fixes to issue discards accurately with assigned rate.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3244cfb1885f..059a219b7740 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1142,6 +1142,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) struct blk_plug plug; int iter = 0, issued = 0; int i; + bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, @@ -1163,11 +1164,20 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) continue; } - if (!issue_cond || is_idle(sbi)) { - issued++; + if (!issue_cond) { __submit_discard_cmd(sbi, dc); + issued++; + continue; } - if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) goto out; } if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) @@ -1177,6 +1187,9 @@ out: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + if (!issued && io_interrupted) + issued = -1; + return issued; } From d5347b1e666dd8ef0d26fde2f4f55e7bbd987dce Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 2 Oct 2017 02:50:16 +0800 Subject: [PATCH 431/804] f2fs: fix potential panic during fstrim As Ju Hyung Park reported: "When 'fstrim' is called for manual trim, a BUG() can be triggered randomly with this patch. I'm seeing this issue on both x86 Desktop and arm64 Android phone. On x86 Desktop, this was caused during Ubuntu boot-up. I have a cronjob installed which calls 'fstrim -v /' during boot. On arm64 Android, this was caused during GC looping with 1ms gc_min_sleep_time & gc_max_sleep_time." 
Root cause of this issue is that f2fs_wait_discard_bios can only be used by f2fs_put_super, because during put_super there must be no other referrers, so it can ignore discard entry's reference count when removing the entry, otherwise in other caller we will hit bug_on in __remove_discard_cmd as there may be other issuer added reference count in discard entry. Thread A Thread B - issue_discard_thread - f2fs_ioc_fitrim - f2fs_trim_fs - f2fs_wait_discard_bios - __issue_discard_cmd - __submit_discard_cmd - __wait_discard_cmd - dc->ref++ - __wait_one_discard_bio - __wait_discard_cmd - __remove_discard_cmd - f2fs_bug_on(sbi, dc->ref) Fixes: 969d1b180d987c2be02de890d0fff0f66a0e80de Reported-by: Ju Hyung Park Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff694127243a..dd840f60e172 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2606,7 +2606,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 059a219b7740..f5c494389483 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1290,11 +1290,11 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +void f2fs_wait_discard_bios(struct 
f2fs_sb_info *sbi, bool umount) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, !umount); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -2324,7 +2324,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, false); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 315e59ad1483..482bb0333806 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, true); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { From 131bc9f6b7f9efc531eb81f8d542618c6c1cc3c5 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Sat, 23 Sep 2017 17:02:18 +0800 Subject: [PATCH 432/804] Revert "f2fs: node segment is prior to data segment selected victim" This reverts commit b9cd20619e359d199b755543474c3d853c8e3415. That patch causes much fewer node segments (which can be used for SSR) than before, and in the corner case (e.g. create and delete *.txt files in one same directory, there will be very few node segments but many data segments), if the reserved free segments are all used up during gc, then the write_checkpoint can still flush dentry pages to data ssr segments, but will probably fail to flush node pages to node ssr segments, since there are not enough node ssr segments left (the left ones are all full). So revert this patch to give a fair chance to let node segments remain for SSR, which provides more robustness for corner cases. 
Conflicts: fs/f2fs/gc.c Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bd16e6631cf3..3a6eaf01cdf7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -267,16 +267,6 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - unsigned int valid_blocks = - get_valid_blocks(sbi, segno, true); - - return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? - valid_blocks * 2 : valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -285,7 +275,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_greedy_cost(sbi, segno); + return get_valid_blocks(sbi, segno, true); else return get_cb_cost(sbi, segno); } From dbce11e9ee5b89fd88e0fef40478c2bb8ff1ef68 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Sep 2017 14:17:51 +0800 Subject: [PATCH 433/804] Revert "f2fs: reuse nids more aggressively" Commit 268344664603 ("f2fs: reuse nids more aggressively") tries to reuse as many nids as possible, in order to mitigate producing obsolete node pages in page cache. But actually, before we reuse the nids and related node page cache, we will always invalidate that node page, so there will not be any obsolete node pages in cache. Let's just revert previous commit, so that nm_i::next_scan_nid can be increased ascendingly, making __build_free_nids traverse all NAT pages more easily, finally, free nid bitmap cache can be enabled as soon as possible.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 32474db18ad9..264ccd157858 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -327,10 +327,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); - - /* in order to reuse the nid */ - if (nm_i->next_scan_nid > ni->nid) - nm_i->next_scan_nid = ni->nid; } /* change address */ From 8ea6e1c327c53c785d9a29303e963d3b5c9f9ff4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Sep 2017 18:58:02 +0800 Subject: [PATCH 434/804] f2fs: introduce read_inline_xattr Commit ba38c27eb93e ("f2fs: enhance lookup xattr") introduces lookup_all_xattrs duplicating from read_all_xattrs, which leaves lots of similar code between them, so introduce a new helper read_inline_xattr to clean up the redundant code.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 59 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ab658419552b..bbdf9955c2dc 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -288,6 +288,29 @@ static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, return entry; } +static int read_inline_xattr(struct inode *inode, struct page *ipage, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + return 0; +} + static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, @@ -310,21 +333,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto out; *xe = __find_inline_xattr(txattr_addr, &last_addr, index, len, name); @@ -386,21 +397,9 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - 
page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto fail; } /* read from xattr node block */ From 6d625a93b4a8acf6eaa2cfebd21ce8bd7c7080dc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Sep 2017 18:58:03 +0800 Subject: [PATCH 435/804] f2fs: introduce read_xattr_block Commit ba38c27eb93e ("f2fs: enhance lookup xattr") introduces lookup_all_xattrs duplicating from read_all_xattrs, which leaves lots of similar codes in between them, so introduce new help read_xattr_block to clean up redundant codes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index bbdf9955c2dc..c5e6a7e42262 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -311,12 +311,31 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, return 0; } +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. 
*/ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) + return PTR_ERR(xpage); + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_put_page(xpage, 1); + + return 0; +} + static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; @@ -345,19 +364,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto out; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } if (last_addr) @@ -382,7 +391,6 @@ out: static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = VALID_XATTR_BLOCK_SIZE; @@ -404,19 +412,9 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. 
*/ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto fail; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } header = XATTR_HDR(txattr_addr); From 322a45d172124837d6a253828465ba2ccb652443 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 14 Sep 2017 10:18:01 +0800 Subject: [PATCH 436/804] f2fs: show flush list status in sysfs This patch adds to show flush list status in sysfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 ++++- fs/f2fs/f2fs.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 87f449845f5f..00c1d4a9f356 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -61,6 +61,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); } if (SM_I(sbi) && SM_I(sbi)->dcc_info) { si->nr_discarded = @@ -349,10 +351,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, + si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. 
%4d), " diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dd840f60e172..7ed6e4abdd15 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2755,7 +2755,8 @@ struct f2fs_stat_info { int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; From 4de0ceb6b7ef46851b82251e2470cd81920d48cc Mon Sep 17 00:00:00 2001 From: Hsiang Kao Date: Sun, 24 Sep 2017 02:45:42 +0800 Subject: [PATCH 437/804] f2fs: allow readpages with NULL file pointer Keep in line with the other Linux file system implementations since page_cache_sync_readahead supports NULL file pointer, and thus we can readahead data by f2fs itself without file opening (something like the btrfs behavior). Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a275cbe57042..3b1dea525f15 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1331,7 +1331,7 @@ static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); From 75d3164ae128764bfef899df03a1facd31ab2f21 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Fri, 29 Sep 2017 22:43:23 +0800 Subject: [PATCH 438/804] f2fs: convert inline data for direct I/O & FI_NO_PREALLOC In FI_NO_PREALLOC cases, direct I/O path may allocate blocks for an inode but keep its inline data flag. This inconsistency may trigger vfs clear_inode nrpages bug_on when evicting the inode. 
We should convert inline data first in this case. Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3b1dea525f15..143355c91873 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -831,6 +831,13 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + /* convert inline data for Direct I/O*/ + if (iocb->ki_flags & IOCB_DIRECT) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; @@ -843,15 +850,11 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; - if (iocb->ki_flags & IOCB_DIRECT) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; + if (iocb->ki_flags & IOCB_DIRECT) return f2fs_map_blocks(inode, &map, 1, __force_buffered_io(inode, WRITE) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); - } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) From f555b0a117d38ea29b157b788437ff6f0c72bd37 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:35 +0800 Subject: [PATCH 439/804] f2fs: obsolete ALLOC_NID_LIST list As Fan Li reported, there is no user traversing nid_list[ALLOC_NID_LIST] which is used for tracking preallocated nids. Let's drop it, and only track preallocated nids in free_nid_root radix-tree. 
Reported-by: Fan Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++-- fs/f2fs/f2fs.h | 15 ++++--- fs/f2fs/node.c | 97 ++++++++++++++++++++++------------------------ fs/f2fs/node.h | 15 ++----- fs/f2fs/shrinker.c | 2 +- 5 files changed, 64 insertions(+), 73 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 00c1d4a9f356..14095fbb4039 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -98,9 +98,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; - si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -233,8 +233,8 @@ get_cache: } /* free nids */ - si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + - NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7ed6e4abdd15..685145846946 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -730,10 +730,13 @@ static inline void __try_update_largest_extent(struct inode *inode, } } -enum nid_list { - FREE_NID_LIST, - ALLOC_NID_LIST, - MAX_NID_LIST, +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, }; struct f2fs_nm_info { @@ -756,8 +759,8 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of 
the free_nid cache */ - struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ - unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 264ccd157858..513f5dfb1952 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,7 +46,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + mem_size = (nm_i->nid_cnt[FREE_NID] * sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { @@ -1760,8 +1760,8 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static int __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool new) +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1771,22 +1771,22 @@ static int __insert_nid_to_list(struct f2fs_sb_info *sbi, return err; } - f2fs_bug_on(sbi, list == FREE_NID_LIST ? 
i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]++; - list_add_tail(&i->list, &nm_i->nid_list[list]); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]++; + if (state == FREE_NID) + list_add_tail(&i->list, &nm_i->free_nid_list); return 0; } -static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool reuse) +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]--; - list_del(&i->list); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); if (!reuse) radix_tree_delete(&nm_i->free_nid_root, i->nid); } @@ -1806,7 +1806,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; - i->state = NID_NEW; + i->state = FREE_NID; if (radix_tree_preload(GFP_NOFS)) goto err; @@ -1819,7 +1819,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - f2fs_create * - f2fs_new_inode * - alloc_nid - * - __insert_nid_to_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg * - build_free_nids * - __build_free_nids @@ -1832,8 +1832,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - new_node_page * - set_node_addr * - alloc_nid_done - * - __remove_nid_from_list(ALLOC_NID_LIST) - * - __insert_nid_to_list(FREE_NID_LIST) + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || @@ -1842,13 +1842,13 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) e = __lookup_free_nid_list(nm_i, nid); if (e) { - if (e->state == NID_NEW) + if (e->state == 
FREE_NID) ret = true; goto err_out; } } ret = true; - err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + err = __insert_free_nid(sbi, i, FREE_NID, true); err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); @@ -1866,8 +1866,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1952,7 +1952,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; } } @@ -1985,7 +1985,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nid = 0; /* Enough entries */ - if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; if (!sync && !available_free_memory(sbi, FREE_NIDS)) @@ -1995,7 +1995,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); - if (nm_i->nid_cnt[FREE_NID_LIST]) + if (nm_i->nid_cnt[FREE_NID]) return; } @@ -2072,15 +2072,15 @@ retry: } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); - i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); - i->state = NID_ALLOC; - 
__insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, FREE_NID, true); + i->state = PREALLOC_NID; + __insert_free_nid(sbi, i, PREALLOC_NID, false); nm_i->available_nids--; update_free_nid_bitmap(sbi, *nid, false, false); @@ -2106,7 +2106,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -2129,12 +2129,12 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, !i); if (!available_free_memory(sbi, FREE_NIDS)) { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID, false); need_free = true; } else { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); - i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID, true); + i->state = FREE_NID; + __insert_free_nid(sbi, i, FREE_NID, false); } nm_i->available_nids++; @@ -2153,20 +2153,19 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], - list) { + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { if (nr_shrink <= 0 || - nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + __remove_free_nid(sbi, i, FREE_NID, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2638,16 +2637,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 
0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - F2FS_RESERVED_NODE_NUM; - nm_i->nid_cnt[FREE_NID_LIST] = 0; - nm_i->nid_cnt[ALLOC_NID_LIST] = 0; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); - INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); + INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -2739,16 +2737,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], - list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + __remove_free_nid(sbi, i, FREE_NID, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); - f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); - f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index bb53e9955ff2..e91b08b4a51a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -150,18 +150,10 @@ struct nat_entry_set { unsigned int entry_cnt; /* the # of nat entries in set */ }; -/* - * For free nid mangement - */ -enum nid_state { - NID_NEW, /* newly added to free nid list */ - NID_ALLOC /* it is allocated */ -}; - 
struct free_nid { struct list_head list; /* for free node id list */ nid_t nid; /* node id */ - int state; /* in use or not: NID_NEW or NID_ALLOC */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ }; static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) @@ -170,12 +162,11 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *fnid; spin_lock(&nm_i->nid_list_lock); - if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + if (nm_i->nid_cnt[FREE_NID] <= 0) { spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], - struct free_nid, list); + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); } diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 5c60fc28ec75..0b5664a1a6cc 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,7 +28,7 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; return count > 0 ? count : 0; } From 814b463d262f19f5997c3632256ea41a4ee0be11 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:36 +0800 Subject: [PATCH 440/804] f2fs: drop FI_UPDATE_WRITE tag after f2fs_issue_flush If we failed to issue flush in ->fsync, we need to keep FI_UPDATE_WRITE flag to make sure triggering flush in next ->fsync. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 531379f513fa..43617d7c596c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -297,10 +297,12 @@ sync_nodes: remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_ino_entry(sbi, ino, UPDATE_INO); - clear_inode_flag(inode, FI_UPDATE_WRITE); if (!atomic) ret = f2fs_issue_flush(sbi); + if (!ret) { + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + } f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); From 9c2526ac2ecbb716523bfd21bf1c3e55e1e28c9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:37 +0800 Subject: [PATCH 441/804] f2fs: fix to show ino management cache size correctly It needs to stat size of ino management cache with all type instead of orphan ino type. Fixes: 652be55162dc ("f2fs: show # of orphan inodes") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 14095fbb4039..d441660c3ba6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -240,7 +240,7 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= ORPHAN_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree); From 08bb9d68d51b2946f244f77865a48b23b29af1eb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 13:59:38 +0800 Subject: [PATCH 442/804] f2fs: enhance multiple device flush When multiple device feature is enabled, during ->fsync we will issue flush in all devices to make sure node/data of the 
file being persisted into storage. But some flushes of device could be unneeded as file's data may not have been written back to those devices. So this patch adds and manages a bitmap per inode in global cache to indicate which device is dirty and it needs to issue flush during ->fsync, hence, we could improve performance of fsync in the multiple-device scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 36 +++++++++++++++++++++++++++++----- fs/f2fs/data.c | 1 + fs/f2fs/f2fs.h | 14 +++++++++++--- fs/f2fs/file.c | 3 ++- fs/f2fs/gc.c | 2 ++ fs/f2fs/inline.c | 1 + fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 3 ++- fs/f2fs/segment.c | 46 +++++++++++++++++++++++++++++++++----------- 9 files changed, 86 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e86f67ac96c6..b1c6e75c2764 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -402,7 +402,8 @@ const struct address_space_operations f2fs_meta_aops = { #endif }; -static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e, *tmp; @@ -427,6 +428,10 @@ retry: if (type != ORPHAN_INO) im->ino_num++; } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + spin_unlock(&im->ino_lock); radix_tree_preload_end(); @@ -455,7 +460,7 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ - __add_ino_entry(sbi, ino, type); + __add_ino_entry(sbi, ino, 0, type); } void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -481,7 +486,7 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) struct ino_entry *e, *tmp; int i; - for (i = all ?
ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -495,6 +500,27 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -531,7 +557,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); update_inode_page(inode); } @@ -555,7 +581,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return err; } - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(sbi, ino, 0, ORPHAN_INO); inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 143355c91873..a655a39d60b3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1495,6 +1495,7 @@ static int __write_data_page(struct page *page, bool *submitted, int err = 0; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 685145846946..06a4d784abce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -244,12 +244,14 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino 
list */ UPDATE_INO, /* for update ino list */ + FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; struct ino_entry { - struct list_head list; /* list head */ - nid_t ino; /* inode number */ + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ }; /* for the list of inodes to be GCed */ @@ -838,6 +840,7 @@ enum { struct flush_cmd { struct completion wait; struct llist_node llnode; + nid_t ino; int ret; }; @@ -965,6 +968,7 @@ enum iostat_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ @@ -2602,7 +2606,7 @@ void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); -int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int create_flush_cmd_control(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); @@ -2664,6 +2668,10 @@ void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); int acquire_orphan_inode(struct f2fs_sb_info *sbi); void release_orphan_inode(struct f2fs_sb_info *sbi); diff --git 
a/fs/f2fs/file.c b/fs/f2fs/file.c index 43617d7c596c..cd569d394272 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -298,10 +298,11 @@ sync_nodes: clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: if (!atomic) - ret = f2fs_issue_flush(sbi); + ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, FLUSH_INO); } f2fs_update_time(sbi, REQ_TIME); out: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3a6eaf01cdf7..32b0b0632e15 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -598,6 +598,7 @@ static void move_data_block(struct inode *inode, block_t bidx, { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_READ, @@ -728,6 +729,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_WRITE, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 7e76c415b913..0fa5ca0907ba 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -112,6 +112,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 50c88e37ed66..ad4f7d52c0ad 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -480,6 +480,7 @@ void f2fs_evict_inode(struct inode *inode) remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 513f5dfb1952..733a8e14a4c8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -63,7 +63,7 @@ bool 
available_free_memory(struct f2fs_sb_info *sbi, int type) } else if (type == INO_ENTRIES) { int i; - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) mem_size += sbi->im[i].ino_num * sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; @@ -1340,6 +1340,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, + .ino = ino_of_node(page), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f5c494389483..5351caa2ffd9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -313,6 +313,7 @@ static int __commit_inmem_pages(struct inode *inode, struct inmem_pages *cur, *tmp; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, @@ -485,15 +486,17 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, return ret; } -static int submit_flush_wait(struct f2fs_sb_info *sbi) +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { - int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); + int ret = 0; int i; - if (!sbi->s_ndevs || ret) - return ret; + if (!sbi->s_ndevs) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); - for (i = 1; i < sbi->s_ndevs; i++) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (!is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; @@ -519,7 +522,9 @@ repeat: fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - ret = submit_flush_wait(sbi); + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, @@ -537,7 +542,7 @@ repeat: goto repeat; } -int f2fs_issue_flush(struct f2fs_sb_info *sbi) +int f2fs_issue_flush(struct f2fs_sb_info *sbi, 
nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; @@ -547,19 +552,20 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); atomic_inc(&fcc->issued_flush); return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1) { - ret = submit_flush_wait(sbi); + if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->issing_flush); atomic_inc(&fcc->issued_flush); return ret; } + cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); @@ -583,7 +589,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) } else { struct flush_cmd *tmp, *next; - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { @@ -2464,6 +2470,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); } +static void update_device_state(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int devidx; + + if (!sbi->s_ndevs) + return; + + devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + + /* update device state for fsync */ + set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); +} + static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); @@ -2478,6 +2498,8 @@ reallocate: if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; + } else if (!err) { + update_device_state(fio); } } @@ -2538,6 +2560,8 @@ int rewrite_data_page(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); + if (!err) + update_device_state(fio); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); From 27eaad09380fe2f1fd8dbfb1e3e7ae6afd70ca80 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Sep 2017 
13:59:39 +0800 Subject: [PATCH 443/804] f2fs: fix to flush multiple device in checkpoint If f2fs manages multiple devices, in checkpoint, we need to issue flush in those devices which contain dirty data/node in their cache before we write checkpoint region, otherwise, filesystem metadata could be corrupted if hitting SPO after checkpoint. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 ++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 29 +++++++++++++++++++++++++++++ fs/f2fs/super.c | 3 +++ 4 files changed, 41 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b1c6e75c2764..90ff066c9569 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1173,6 +1173,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; + int err; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1266,6 +1267,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 06a4d784abce..fdf216423473 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1186,6 +1186,8 @@ struct f2fs_sb_info { struct list_head s_list; int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2608,6 +2610,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int 
create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5351caa2ffd9..c009bdff2ff6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -659,6 +659,28 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } } +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + if (!sbi->s_ndevs) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -2482,6 +2504,13 @@ static void update_device_state(struct f2fs_io_info *fio) /* update device state for fsync */ set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 482bb0333806..5fe6047d1db8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1969,6 +1969,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (j = HOT; j < NR_TEMP_TYPE; j++) mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) From 
684447dad1385fef8a1c2bfaff770860b0beddc2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:32 +0800 Subject: [PATCH 444/804] f2fs: support issuing/waiting discard in range Fstrim intends to trim invalid blocks of filesystem only with specified range and granularity, but actually, it will issue all previous cached discard commands which may be out-of-range and be with unmatched granularity, it's unneeded. In order to fix the above issues, this patch introduces new helpers to issue and wait for discards in range and adds a new fstrim_list for tracking in-flight discard from ->fstrim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 127 ++++++++++++++++++++++++++++++++++++++-------- fs/f2fs/super.c | 2 +- 3 files changed, 109 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fdf216423473..ea2cd4112a40 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -316,6 +316,7 @@ struct discard_cmd_control { struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; @@ -2616,7 +2617,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int
npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c009bdff2ff6..8bdc31d1c847 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -954,9 +954,11 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc) + struct discard_cmd *dc, bool fstrim) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = fstrim ? &(dcc->fstrim_list) : + &(dcc->wait_list); struct bio *bio = NULL; if (dc->state != D_PREP) @@ -977,7 +979,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); - list_move_tail(&dc->list, &dcc->wait_list); + list_move_tail(&dc->list, wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); f2fs_update_iostat(sbi, FS_DISCARD, 1); @@ -1162,6 +1164,68 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + unsigned int start, unsigned int end, + unsigned int granularity) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < granularity) + goto skip; + + if (dc->state != D_PREP) { + 
list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dc, true); + + if (++issued >= DISCARD_ISSUE_RATE) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + schedule(); + + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1184,22 +1248,19 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) /* Hurry up to finish fstrim */ if (dcc->pend_list_tag[i] & P_TRIM) { - __submit_discard_cmd(sbi, dc); + __submit_discard_cmd(sbi, dc, false); issued++; - - if (fatal_signal_pending(current)) - break; continue; } if (!issue_cond) { - __submit_discard_cmd(sbi, dc); + __submit_discard_cmd(sbi, dc, false); issued++; continue; } if (is_idle(sbi)) { - __submit_discard_cmd(sbi, dc); + __submit_discard_cmd(sbi, dc, false); issued++; } else { io_interrupted = true; @@ -1253,10 +1314,14 @@ static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, mutex_unlock(&dcc->cmd_lock); } -static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, bool wait_cond, + block_t start, block_t end, + unsigned int granularity, + bool fstrim) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->wait_list); + struct list_head *wait_list = fstrim ? 
&(dcc->fstrim_list) : + &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; @@ -1265,6 +1330,10 @@ next: mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (dc->lstart + dc->len <= start || end <= dc->lstart) + continue; + if (dc->len < granularity) + continue; if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); @@ -1282,6 +1351,11 @@ next: } } +static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + __wait_discard_cmd_range(sbi, wait_cond, 0, UINT_MAX, 1, false); +} + /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { @@ -1317,12 +1391,12 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) +/* This comes from f2fs_put_super */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, !umount); + __wait_all_discard_cmd(sbi, false); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -1366,7 +1440,7 @@ static int issue_discard_thread(void *data) issued = __issue_discard_cmd(sbi, true); if (issued) { - __wait_discard_cmd(sbi, true); + __wait_all_discard_cmd(sbi, true); wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; } else { wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; @@ -1677,6 +1751,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->pend_list_tag[i] |= P_ACTIVE; } INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); @@ -2304,7 +2379,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + 
F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno; + unsigned int start_segno, end_segno, cur_segno; + block_t start_block, end_block; struct cp_control cpc; int err = 0; @@ -2325,12 +2401,17 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); /* do checkpoint to issue discard commands safely */ - for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { - cpc.trim_start = start_segno; + for (cur_segno = start_segno; cur_segno <= end_segno; + cur_segno = cpc.trim_end + 1) { + cpc.trim_start = cur_segno; if (sbi->discard_blks == 0) break; @@ -2338,7 +2419,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.trim_end = end_segno; else cpc.trim_end = min_t(unsigned int, - rounddown(start_segno + + rounddown(cur_segno + BATCHED_TRIM_SEGMENTS(sbi), sbi->segs_per_sec) - 1, end_segno); @@ -2350,9 +2431,13 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } - /* It's time to issue all the filed discards */ - mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi, false); + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + + __issue_discard_cmd_range(sbi, start_block, end_block, cpc.trim_minlen); + __wait_discard_cmd_range(sbi, true, start_block, end_block, + cpc.trim_minlen, true); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5fe6047d1db8..07734666eae1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure 
to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi, true); + f2fs_wait_discard_bios(sbi); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { From 1e65afd14d32eb318caaebf16f5797e2c723fa20 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:33 +0800 Subject: [PATCH 445/804] f2fs: wrap discard policy This patch wraps scattered optional parameters into discard policy as below, later, with it we expect that we can adjust these parameters with proper strategy in different scenario. struct discard_policy { unsigned int min_interval; /* used for candidates exist */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ }; This patch doesn't change any logic of codes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 +++++++++++- fs/f2fs/segment.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ea2cd4112a40..d4dd9efd48ec 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -214,7 +214,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) -#define DISCARD_ISSUE_RATE 8 +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -310,6 +310,15 @@ struct discard_cmd { int error; /* bio error */ }; +struct discard_policy { + unsigned int min_interval; /* used for candidates exist */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int 
max_requests; /* # of discards issued per round */ + unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ +}; + struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ @@ -328,6 +337,7 @@ struct discard_cmd_control { atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ + struct discard_policy dpolicy; /* current discard policy */ }; /* for the list of fsync inodes, used only during recovery */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8bdc31d1c847..c1d648a7d214 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -960,6 +960,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct list_head *wait_list = fstrim ? &(dcc->fstrim_list) : &(dcc->wait_list); struct bio *bio = NULL; + int flag = dcc->dpolicy.sync ? 
REQ_SYNC : 0; if (dc->state != D_PREP) return; @@ -978,7 +979,7 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); + submit_bio(flag, bio); list_move_tail(&dc->list, wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); @@ -1172,6 +1173,7 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; + struct discard_policy *dpolicy = &dcc->dpolicy; struct blk_plug plug; int issued; @@ -1204,7 +1206,7 @@ next: __submit_discard_cmd(sbi, dc, true); - if (++issued >= DISCARD_ISSUE_RATE) { + if (++issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; blk_finish_plug(&plug); @@ -1232,6 +1234,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; + struct discard_policy *dpolicy = &dcc->dpolicy; int iter = 0, issued = 0; int i; bool io_interrupted = false; @@ -1259,14 +1262,16 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) continue; } - if (is_idle(sbi)) { - __submit_discard_cmd(sbi, dc, false); - issued++; - } else { + if (dpolicy->io_aware && i < dpolicy->io_aware_gran && + !is_idle(sbi)) { io_interrupted = true; + goto skip; } - if (++iter >= DISCARD_ISSUE_RATE) + __submit_discard_cmd(sbi, dc, false); + issued++; +skip: + if (++iter >= dpolicy->max_requests) goto out; } if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) @@ -1415,6 +1420,7 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + struct discard_policy *dpolicy = &dcc->dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; @@ -1441,9 +1447,9 
@@ static int issue_discard_thread(void *data) issued = __issue_discard_cmd(sbi, true); if (issued) { __wait_all_discard_cmd(sbi, true); - wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + wait_ms = dpolicy->min_interval; } else { - wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + wait_ms = dpolicy->max_interval; } sb_end_intwrite(sbi->sb); @@ -1728,6 +1734,18 @@ skip: wake_up_discard_thread(sbi, false); } +static void inline init_discard_policy(struct discard_cmd_control *dcc) +{ + struct discard_policy *dpolicy = &dcc->dpolicy; + + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + dpolicy->sync = true; +} + static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -1761,6 +1779,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->undiscard_blks = 0; dcc->root = RB_ROOT; + init_discard_policy(dcc); + init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: From a34ab5ca4f94543741fa304c4cb2095f0bc82898 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:34 +0800 Subject: [PATCH 446/804] f2fs: split discard policy There are many different scenarios such as fstrim, umount, urgent or background where we will issue discards, actually, they need use different policy in aspect of io aware, discard granularity, delay interval and so on. But now they just share one common discard policy, so there will be race when changing policy in between these scenarios, the interference of changing discard policy will be very serious. This patch changes to split discard policy for different scenarios. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 17 ++++-- fs/f2fs/segment.c | 149 +++++++++++++++++++++++----------------------- fs/f2fs/segment.h | 5 +- fs/f2fs/sysfs.c | 13 ---- 4 files changed, 88 insertions(+), 96 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4dd9efd48ec..ec7a55218967 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -275,10 +275,6 @@ struct discard_entry { #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) -#define P_ACTIVE 0x01 -#define P_TRIM 0x02 -#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) - enum { D_PREP, D_SUBMIT, @@ -310,13 +306,23 @@ struct discard_cmd { int error; /* bio error */ }; +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + struct discard_policy { + int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ + unsigned int granularity; /* discard granularity */ }; struct discard_cmd_control { @@ -337,7 +343,6 @@ struct discard_cmd_control { atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ - struct discard_policy dpolicy; /* current discard policy */ }; /* for the list of fsync inodes, used only during recovery */ @@ -2625,6 +2630,8 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void init_discard_policy(struct discard_policy 
*dpolicy, int discard_type, + unsigned int granularity); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c1d648a7d214..f1dbf8d5574e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -954,13 +954,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc, bool fstrim) + struct discard_policy *dpolicy, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = fstrim ? &(dcc->fstrim_list) : - &(dcc->wait_list); + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); struct bio *bio = NULL; - int flag = dcc->dpolicy.sync ? REQ_SYNC : 0; + int flag = dpolicy->sync ? 
REQ_SYNC : 0; if (dc->state != D_PREP) return; @@ -1166,14 +1167,13 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, } static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, - unsigned int start, unsigned int end, - unsigned int granularity) + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; - struct discard_policy *dpolicy = &dcc->dpolicy; struct blk_plug plug; int issued; @@ -1196,7 +1196,7 @@ next: while (dc && dc->lstart <= end) { struct rb_node *node; - if (dc->len < granularity) + if (dc->len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { @@ -1204,7 +1204,7 @@ next: goto skip; } - __submit_discard_cmd(sbi, dc, true); + __submit_discard_cmd(sbi, dpolicy, dc); if (++issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; @@ -1228,54 +1228,39 @@ skip: mutex_unlock(&dcc->cmd_lock); } -static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - struct discard_policy *dpolicy = &dcc->dpolicy; - int iter = 0, issued = 0; - int i; + int i, iter = 0, issued = 0; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dpolicy->granularity) + break; pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - /* Hurry up to finish fstrim */ - if (dcc->pend_list_tag[i] & P_TRIM) { - 
__submit_discard_cmd(sbi, dc, false); - issued++; - continue; - } - - if (!issue_cond) { - __submit_discard_cmd(sbi, dc, false); - issued++; - continue; - } - if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi)) { io_interrupted = true; goto skip; } - __submit_discard_cmd(sbi, dc, false); + __submit_discard_cmd(sbi, dpolicy, dc); issued++; skip: if (++iter >= dpolicy->max_requests) goto out; } - if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) - dcc->pend_list_tag[i] &= (~P_TRIM); } out: blk_finish_plug(&plug); @@ -1319,14 +1304,13 @@ static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, mutex_unlock(&dcc->cmd_lock); } -static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, bool wait_cond, - block_t start, block_t end, - unsigned int granularity, - bool fstrim) +static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = fstrim ? &(dcc->fstrim_list) : - &(dcc->wait_list); + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? 
+ &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; @@ -1337,9 +1321,9 @@ next: list_for_each_entry_safe(dc, tmp, wait_list, list) { if (dc->lstart + dc->len <= start || end <= dc->lstart) continue; - if (dc->len < granularity) + if (dc->len < dpolicy->granularity) continue; - if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + if (dc->state == D_DONE && !dc->ref) { wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } else { @@ -1356,9 +1340,10 @@ next: } } -static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) { - __wait_discard_cmd_range(sbi, wait_cond, 0, UINT_MAX, 1, false); + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1398,21 +1383,14 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) /* This comes from f2fs_put_super */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) -{ - __issue_discard_cmd(sbi, false); - __drop_discard_cmd(sbi); - __wait_all_discard_cmd(sbi, false); -} - -static void mark_discard_range_all(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; + struct discard_policy dpolicy; - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) - dcc->pend_list_tag[i] |= P_TRIM; - mutex_unlock(&dcc->cmd_lock); + init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + __drop_discard_cmd(sbi); + __wait_all_discard_cmd(sbi, &dpolicy); } static int issue_discard_thread(void *data) @@ -1420,13 +1398,16 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; - struct discard_policy *dpolicy = &dcc->dpolicy; + struct discard_policy dpolicy; unsigned int wait_ms = 
DEF_MIN_DISCARD_ISSUE_TIME; int issued; set_freezable(); do { + init_discard_policy(&dpolicy, DPOLICY_BG, + dcc->discard_granularity); + wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || dcc->discard_wake, @@ -1439,17 +1420,18 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) { dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - mark_discard_range_all(sbi); + init_discard_policy(&dpolicy, + DPOLICY_FORCE, 1); } sb_start_intwrite(sbi->sb); - issued = __issue_discard_cmd(sbi, true); + issued = __issue_discard_cmd(sbi, &dpolicy); if (issued) { - __wait_all_discard_cmd(sbi, true); - wait_ms = dpolicy->min_interval; + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; } else { - wait_ms = dpolicy->max_interval; + wait_ms = dpolicy.max_interval; } sb_end_intwrite(sbi->sb); @@ -1734,16 +1716,35 @@ skip: wake_up_discard_thread(sbi, false); } -static void inline init_discard_policy(struct discard_cmd_control *dcc) +void init_discard_policy(struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) { - struct discard_policy *dpolicy = &dcc->dpolicy; - - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; - dpolicy->io_aware = true; + /* common policy */ + dpolicy->type = discard_type; dpolicy->sync = true; + dpolicy->granularity = granularity; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + 
dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) @@ -1763,11 +1764,8 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) { + for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); - if (i >= dcc->discard_granularity - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - } INIT_LIST_HEAD(&dcc->wait_list); INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); @@ -1779,8 +1777,6 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->undiscard_blks = 0; dcc->root = RB_ROOT; - init_discard_policy(dcc); - init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: @@ -2402,6 +2398,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) unsigned int start_segno, end_segno, cur_segno; block_t start_block, end_block; struct cp_control cpc; + struct discard_policy dpolicy; int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) @@ -2455,9 +2452,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); - __issue_discard_cmd_range(sbi, start_block, end_block, cpc.trim_minlen); - __wait_discard_cmd_range(sbi, true, start_block, end_block, - cpc.trim_minlen, true); + init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + 
__wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ffa11274b0ce..d12d9cd99f91 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -795,8 +795,9 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) goto wake_up; mutex_lock(&dcc->cmd_lock); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e2c258f717cd..89f25efffd43 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -154,23 +154,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, } if (!strcmp(a->attr.name, "discard_granularity")) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; - if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; if (t == *ui) return count; - - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) { - if (i >= t - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - else - dcc->pend_list_tag[i] &= (~P_ACTIVE); - } - mutex_unlock(&dcc->cmd_lock); - *ui = t; return count; } From bd502c6e3e7a59aaf28b6d065384bd90f40790bf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:35 +0800 Subject: [PATCH 447/804] f2fs: reduce cmd_lock coverage in __issue_discard_cmd __submit_discard_cmd may lead long latency due to exhaustion of I/O request resource in block layer, so issuing all discard under cmd_lock may lead to hangtask, in order to avoid that, let's reduce its coverage.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1dbf8d5574e..859ead471243 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1238,14 +1238,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, iter = 0, issued = 0; bool io_interrupted = false; - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); - blk_start_plug(&plug); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dpolicy->granularity) break; pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1259,12 +1259,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, issued++; skip: if (++iter >= dpolicy->max_requests) - goto out; + break; } + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (iter >= dpolicy->max_requests) + break; } -out: - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); if (!issued && io_interrupted) issued = -1; From df74eacb207596ba0f4323bbb6b2bbc974c6f87b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:36 +0800 Subject: [PATCH 448/804] f2fs: trace f2fs_remove_discard This patch adds tracepoint to trace f2fs_remove_discard. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ include/trace/events/f2fs.h | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 859ead471243..41b3fc0cca62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -822,6 +822,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7063bbcca03b..1c725ff5786b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1274,6 +1274,13 @@ DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, TP_ARGS(dev, blkstart, blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_remove_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), From 68e801abc520b06ba24af6fde667408e4372fe3a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Oct 2017 09:08:37 +0800 Subject: [PATCH 449/804] f2fs: give up CP_TRIMMED_FLAG if it drops discards In ->umount, once we drop remained discard entries, we should not set CP_TRIMMED_FLAG with another checkpoint. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 13 ++++++++++--- fs/f2fs/super.c | 5 +++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec7a55218967..9267f50dfe8f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2634,7 +2634,7 @@ void init_discard_policy(struct discard_policy *dpolicy, int discard_type, unsigned int granularity); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 41b3fc0cca62..a065a2c01b5f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1276,12 +1276,13 @@ skip: return issued; } -static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; + bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { @@ -1289,9 +1290,12 @@ static void __drop_discard_cmd(struct f2fs_sb_info *sbi) list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); + dropped = true; } } mutex_unlock(&dcc->cmd_lock); + + return dropped; } static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, @@ -1386,15 +1390,18 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = 
SM_I(sbi)->dcc_info; struct discard_policy dpolicy; + bool dropped; init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); - __drop_discard_cmd(sbi); + dropped = __drop_discard_cmd(sbi); __wait_all_discard_cmd(sbi, &dpolicy); + + return dropped; } static int issue_discard_thread(void *data) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07734666eae1..8e7ef3712bcc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -781,6 +781,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + bool dropped; f2fs_quota_off_umount(sb); @@ -801,9 +802,9 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; From 0c47a892d555f0bcf6613c6b85a6e95d92c55973 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 7 Oct 2017 16:02:21 +0200 Subject: [PATCH 450/804] f2fs: Fix bool initialization/comparison Bool initializations should use true and false. Bool tests don't need comparisons. Signed-off-by: Thomas Meyer Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a655a39d60b3..277dafd7c964 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -416,8 +416,8 @@ next: bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; - /* set submitted = 1 as a return value */ - fio->submitted = 1; + /* set submitted = true as a return value */ + fio->submitted = true; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); From 85853e7e38d7691ae9d41ef7ed4579313b857584 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Oct 2017 17:55:19 +0800 Subject: [PATCH 451/804] f2fs: fix to avoid race when accessing last_disk_size last_disk_size could be wrong due to concurrently updating, so using i_sem semaphore to make last_disk_size updating exclusive to fix this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 10 ++++++++-- fs/f2fs/file.c | 4 ++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 277dafd7c964..43e32216e681 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1567,8 +1567,11 @@ write: err = do_write_data_page(&fio); } } + + down_write(&F2FS_I(inode)->i_sem); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); done: if (err && err != -ENOENT) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9267f50dfe8f..bd5839895737 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2345,9 +2345,10 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + bool ret; + if (dsync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; spin_lock(&sbi->inode_lock[DIRTY_META]); ret = list_empty(&F2FS_I(inode)->gdirty_list); @@ -2358,7 +2359,12 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) file_keep_isize(inode) || i_size_read(inode) & PAGE_MASK) return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); + + down_read(&F2FS_I(inode)->i_sem); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + up_read(&F2FS_I(inode)->i_sem); + + return ret; } static inline int 
f2fs_readonly(struct super_block *sb) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cd569d394272..bf6845c01d38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -759,6 +759,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = inode->i_ctime = current_time(inode); } + down_write(&F2FS_I(inode)->i_sem); + F2FS_I(inode)->last_disk_size = i_size_read(inode); + up_write(&F2FS_I(inode)->i_sem); + size_changed = true; } From 5562a3c53963a24d2f34c258fe11e09ce8aa336f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 7 Oct 2017 00:08:05 -0700 Subject: [PATCH 452/804] f2fs/crypto: drop crypto key at evict_inode only This patch avoids dropping crypto key in f2fs_drop_inode, so we can guarantee it happens only at evict_inode. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8e7ef3712bcc..c78bbb78c870 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -673,7 +673,6 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); - fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } From cb98f70dea02334bb6f30bb9e879456d789f3afe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 12 Oct 2017 19:12:53 -0700 Subject: [PATCH 453/804] f2fs: avoid stale fi->gdirty_list pointer When doing fault injection test, f2fs_evict_inode() didn't remove gdirty_list which incurs a kernel panic due to wrong pointer access. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ad4f7d52c0ad..3617e7fca930 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -520,8 +520,10 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + else + f2fs_inode_synced(inode); /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) From 8e84f379df61f86bb72830241a9a67de13a1b119 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Oct 2017 10:27:45 -0700 Subject: [PATCH 454/804] f2fs: expose some sectors to user in inline data or dentry case If there's some data written through inline data or dentry, we need to show st_blocks. This fixes reporting zero blocks even though there is small written data. Cc: stable@vger.kernel.org Reviewed-by: Chao Yu [Jaegeuk Kim: avoid link file for quotacheck] Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bf6845c01d38..b41a4a2d0a0a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -668,6 +668,12 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + return 0; } From 40d6250f046a3ddddc4410fc35af0fcc4ea6772d Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Sat, 14 Oct 2017 08:13:32 +0800 Subject: [PATCH 455/804] f2fs: skip searching non-exist range in truncate_hole Let's skip entire non-exist area to speed up truncate_hole by using get_next_page_offset.
Signed-off-by: Weichao Guo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b41a4a2d0a0a..505fb4ee03a4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -842,7 +842,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start++; + pg_start = get_next_page_offset(&dn, pg_start); continue; } return err; From bb0db666d4bca3a2139bce9da3ed321a226a974a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Oct 2017 17:33:41 +0800 Subject: [PATCH 456/804] f2fs: trace f2fs_lookup This patch adds trace for f2fs_lookup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 49 +++++++++++++++++++++----------- include/trace/events/f2fs.h | 56 +++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d92b8e9064cb..5503da9c55f8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -331,12 +331,15 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct f2fs_dir_entry *de; struct page *page; - nid_t ino; + struct dentry *new; + nid_t ino = -1; int err = 0; unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + trace_f2fs_lookup_start(dir, dentry, flags); + if (f2fs_encrypted_inode(dir)) { - int res = fscrypt_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); /* * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is @@ -346,18 +349,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (fscrypt_has_encryption_key(dir)) fscrypt_set_encrypted_dentry(dentry); fscrypt_set_d_op(dentry); - if (res && res != -ENOKEY) - return ERR_PTR(res); + if (err && err != -ENOKEY) + goto out; } - if (dentry->d_name.len > F2FS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); 
+ if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { - if (IS_ERR(page)) - return (struct dentry *)page; - return d_splice_alias(inode, dentry); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + goto out_splice; } ino = le32_to_cpu(de->ino); @@ -365,19 +372,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { err = __recover_dot_dentries(dir, root_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && @@ -386,12 +395,18 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); err = -EPERM; - goto err_out; + goto out_iput; } - return d_splice_alias(inode, dentry); - -err_out: +out_splice: + new = d_splice_alias(inode, dentry); + if (IS_ERR(new)) + err = PTR_ERR(new); + trace_f2fs_lookup_end(dir, dentry, ino, err); + return new; +out_iput: iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); return ERR_PTR(err); } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1c725ff5786b..427ad06189ec 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -716,6 +716,62 @@ TRACE_EVENT(f2fs_get_victim, __entry->free) ); +TRACE_EVENT(f2fs_lookup_start, + + TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(const char *, name) + 
__field(unsigned int, flags) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->name = dentry->d_name.name; + __entry->flags = flags; + ), + + TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", + show_dev_ino(__entry), + __entry->name, + __entry->flags) +); + +TRACE_EVENT(f2fs_lookup_end, + + TP_PROTO(struct inode *dir, struct dentry *dentry, nid_t ino, + int err), + + TP_ARGS(dir, dentry, ino, err), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(const char *, name) + __field(nid_t, cino) + __field(int, err) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->name = dentry->d_name.name; + __entry->cino = ino; + __entry->err = err; + ), + + TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", + show_dev_ino(__entry), + __entry->name, + __entry->cino, + __entry->err) +); + TRACE_EVENT(f2fs_fallocate, TP_PROTO(struct inode *inode, int mode, From 2b903fe94cd0743df361d23aa0ce486ee78be510 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:33 +0800 Subject: [PATCH 457/804] f2fs: trace f2fs_readdir This patch adds trace for f2fs_readdir. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 14 +++++++++----- include/trace/events/f2fs.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4f2a8fedb313..c745f977869c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -14,6 +14,7 @@ #include "node.h" #include "acl.h" #include "xattr.h" +#include static unsigned long dir_blocks(struct inode *inode) { @@ -847,6 +848,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct f2fs_dentry_block *dentry_blk = NULL; struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); @@ -855,16 +857,16 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (f2fs_encrypted_inode(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) - return err; + goto out; err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) - return err; + goto out; } if (f2fs_has_inline_dentry(inode)) { err = f2fs_read_inline_dir(file, ctx, &fstr); - goto out; + goto out_free; } /* readahead for multi pages of dir */ @@ -880,7 +882,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = 0; continue; } else { - goto out; + goto out_free; } } @@ -900,8 +902,10 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } -out: +out_free: fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); return err < 0 ? 
err : 0; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 427ad06189ec..c9be882c2718 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -772,6 +772,35 @@ TRACE_EVENT(f2fs_lookup_end, __entry->err) ); +TRACE_EVENT(f2fs_readdir, + + TP_PROTO(struct inode *dir, loff_t start_pos, loff_t end_pos, int err), + + TP_ARGS(dir, start_pos, end_pos, err), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, start) + __field(loff_t, end) + __field(int, err) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->start = start_pos; + __entry->end = end_pos; + __entry->err = err; + ), + + TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d", + show_dev_ino(__entry), + __entry->start, + __entry->end, + __entry->err) +); + TRACE_EVENT(f2fs_fallocate, TP_PROTO(struct inode *inode, int mode, From c8be47b54018a12c96dd7328951405202f6c3d89 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:34 +0800 Subject: [PATCH 458/804] f2fs: allow readdir() to be interrupted This patch follows ext4 to allow readdir() in large empty directory to be interrupted. Referenced commit of ext4: 1f60fbe72749 ("ext4: allow readdir()'s of large empty directories to be interrupted"). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c745f977869c..95500eaae681 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -10,6 +10,7 @@ */ #include #include +#include #include "f2fs.h" #include "node.h" #include "acl.h" @@ -875,6 +876,14 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); for (; n < npages; n++) { + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); From 4d6e68be2534b03a135d5e98dee94d072166c455 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:35 +0800 Subject: [PATCH 459/804] f2fs: relocate readahead codes in readdir() Previously, for a large directory, we did readahead only once in readdir(), so readdir()'s performance may drop when traversing later blocks. In order to avoid this, relocate the readahead code so that it covers the whole traversal flow.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 95500eaae681..65c528539b78 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -870,11 +870,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - /* readahead for multi pages of dir */ - if (npages - n > 1 && !ra_has_index(ra, n)) - page_cache_sync_readahead(inode->i_mapping, ra, file, n, - min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - for (; n < npages; n++) { /* allow readdir() to be interrupted */ @@ -884,6 +879,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } cond_resched(); + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); + dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); From 83ed7a615f0a8c2efb448a17c0e51ab33adbee2c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 13 Oct 2017 18:01:36 +0800 Subject: [PATCH 460/804] f2fs: update ctx->pos correctly when hitting hole in directory This patch fixes to update ctx->pos correctly when hitting hole in directory. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 65c528539b78..1955707b138b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -870,7 +870,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++) { + for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -907,7 +907,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) break; } - ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } From 171b638fc49bdaf3302d7df8eb7b9d5bc2d3dfbe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 18 Oct 2017 19:05:57 -0700 Subject: [PATCH 461/804] f2fs: limit # of inmemory pages If some abnormal users try lots of atomic write operations, f2fs is able to produce pinned pages in the main memory which affects system performance. This patch limits that as 20% over total memory size, and if f2fs reaches to the limit, it will drop all the inmemory pages. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 ++++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 4 ++++ fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/f2fs/super.c | 1 + 6 files changed, 55 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 43e32216e681..6750584b7107 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1941,6 +1941,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); + if (f2fs_is_atomic_file(inode) && + !available_free_memory(sbi, INMEM_PAGES)) { + err = -ENOMEM; + goto fail; + } + /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: @@ -2017,6 +2023,8 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages_all(sbi); return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bd5839895737..571b7f18171c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -665,6 +665,7 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + struct list_head inmem_ilist; /* list for inmem inodes */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -1029,6 +1030,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -2625,6 +2627,7 @@ void destroy_node_manager_caches(void); */ bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages_all(struct f2fs_sb_info *sbi); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 733a8e14a4c8..22f2ba4a6326 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -74,6 +74,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INMEM_PAGES) { + /* it allows 20% / total_ram for inmemory pages */ + mem_size = get_pages(sbi, F2FS_INMEM_PAGES); + res = mem_size < (val.totalram / 5); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; diff --git 
a/fs/f2fs/node.h b/fs/f2fs/node.h index e91b08b4a51a..0ee3e5ff49a3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -140,6 +140,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ + INMEM_PAGES, /* indicates inmemory pages */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a065a2c01b5f..f0916b24f5b4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,6 +186,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) void register_inmem_page(struct inode *inode, struct page *page) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; @@ -204,6 +205,10 @@ void register_inmem_page(struct inode *inode, struct page *page) mutex_lock(&fi->inmem_lock); get_page(page); list_add_tail(&new->list, &fi->inmem_pages); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(&fi->inmem_ilist)) + list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); @@ -262,12 +267,41 @@ next: return err; } +void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; + struct inode *inode; + struct f2fs_inode_info *fi; +next: + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + return; + } + fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + if (inode) { + drop_inmem_pages(inode); + iput(inode); + } + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto next; +} + void drop_inmem_pages(struct inode *inode) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); 
mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); @@ -399,6 +433,10 @@ int commit_inmem_pages(struct inode *inode) /* drop all uncommitted pages */ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_COMMIT); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c78bbb78c870..1f8711bbb89f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -625,6 +625,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); From 032a6906825a2ebe60204ba67002a17116113c13 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2017 09:43:56 -0700 Subject: [PATCH 462/804] f2fs: retry ENOMEM for quota_read|write This gives another chance to read or write quota data. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1f8711bbb89f..92abf034bde7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1322,8 +1322,13 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: page = read_mapping_page(mapping, blkidx, NULL); - if (IS_ERR(page)) + if (IS_ERR(page)) { + if (PTR_ERR(page) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto repeat; + } return PTR_ERR(page); + } lock_page(page); @@ -1366,11 +1371,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, while (towrite > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); - +retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, &page, NULL); - if (unlikely(err)) + if (unlikely(err)) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } break; + } kaddr = kmap_atomic(page); memcpy(kaddr + offset, data, tocopy); From 5f5f59322240a84bcf4c1896754b255b4b158d1d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2017 11:48:57 -0700 Subject: [PATCH 463/804] f2fs: remove obsolete pointer for truncate_xattr_node This patch removes obsolete parameter for truncate_xattr_node.
Suggested-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 10 ++++------ fs/f2fs/xattr.c | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 571b7f18171c..72de8ae4be13 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2592,7 +2592,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode, struct page *page); +int truncate_xattr_node(struct inode *inode); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 22f2ba4a6326..d7e0a4366527 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -962,7 +962,8 @@ fail: return err > 0 ? 
0 : err; } -int truncate_xattr_node(struct inode *inode, struct page *page) +/* caller must lock inode page */ +int truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -978,10 +979,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, page, npage, nid); - - if (page) - dn.inode_page_locked = true; + set_new_dnode(&dn, inode, NULL, npage, nid); truncate_node(&dn); return 0; } @@ -1000,7 +998,7 @@ int remove_inode_page(struct inode *inode) if (err) return err; - err = truncate_xattr_node(inode, dn.inode_page); + err = truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c5e6a7e42262..442c7ec5acd0 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -468,7 +468,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode, ipage); + err = truncate_xattr_node(inode); alloc_nid_failed(sbi, new_nid); return err; } From 5c15033ceaea9900ecd1a5551a8080ee1a4abfdb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2017 12:07:11 -0700 Subject: [PATCH 464/804] Revert "f2fs: return wrong error number on f2fs_quota_write" This reverts commit 4f31d26b0c17f2aae6a6afeb823a87e20671ab4b. It turns out that we need to report error number if nothing was written. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 92abf034bde7..8d79b1887cb4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1397,7 +1397,7 @@ retry: } if (len == towrite) - return 0; + return err; inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); From 03b1cb0bb4a2f4b1e512aa2b3dcaf22717e7e76e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 Oct 2017 10:34:14 +0800 Subject: [PATCH 465/804] f2fs: fix to correct no_fggc_candidate There may be an extreme case as below: For one section contains one segment, and there are total 100 segments with 10% over-provision ratio in f2fs partition, fggc_threshold will be rounded down to 460 instead of 460.8 as below calculation: sbi->fggc_threshold = div_u64((u64)(main_count - ovp_count) * BLKS_PER_SEC(sbi), (main_count - resv_count)); If section usage is as: 60 segments which contain 460 valid blocks 40 segments which contain 462 valid blocks As valid block number in all sections is larger than fggc_threshold, so none of them will be chosen as candidate due to incorrect fggc_threshold. Let's just soften the term of choosing foreground GC candidates.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d12d9cd99f91..9342b973da65 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -730,7 +730,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > sbi->fggc_threshold) return true; return false; From 807486c79534a3e9286f40a3cbf908a827d5a957 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 11:52:47 +0200 Subject: [PATCH 466/804] f2fs: avoid using timespec All uses of timespec are deprecated, and this one is not particularly useful, as the documented method for converting seconds to jiffies is to multiply by 'HZ'. Signed-off-by: Arnd Bergmann Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 72de8ae4be13..88d5c99f44e7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1269,8 +1269,7 @@ static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) { - struct timespec ts = {sbi->interval_time[type], 0}; - unsigned long interval = timespec_to_jiffies(&ts); + unsigned long interval = sbi->interval_time[type] * HZ; return time_after(jiffies, sbi->last_time[type] + interval); } From 91bea0c391b3c01b617237a107e76151c2b376b7 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 19 Oct 2017 12:58:21 +0200 Subject: [PATCH 467/804] f2fs: remove several redundant assignments There are several assignments to variables that are redundant as the values are never read when the variables are updated later and so the redundant 
statements can be safely removed. Cleans up clang warnings: fs/f2fs/segment.c:923:19: warning: Value stored to 'p' during its initialization is never read fs/f2fs/segment.c:2060:2: warning: Value stored to 'hint' is never read fs/f2fs/segment.c:2353:2: warning: Value stored to 'start_block' is never read fs/f2fs/segment.c:2354:2: warning: Value stored to 'end_block' is never read Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f0916b24f5b4..85295baa74c8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1038,7 +1038,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct rb_node **p = &dcc->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct discard_cmd *dc = NULL; @@ -2175,7 +2175,6 @@ find_other_zone: } secno = left_start; skip_left: - hint = secno; segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); @@ -2468,9 +2467,6 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); - start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, end_segno + 1); - cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); From fc13f9d7ce1e4d04dcc4204dad751cb9dda11d3b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 23 Oct 2017 23:48:49 +0200 Subject: [PATCH 468/804] f2fs: stop all the operations by cp_error flag This patch replaces to use cp_error flag instead of RDONLY for quota off. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 3 +++ fs/f2fs/checkpoint.c | 1 - fs/f2fs/file.c | 26 ++++++++++++++++++++++++++ fs/f2fs/namei.c | 30 ++++++++++++++++++++++++++++++ fs/f2fs/super.c | 3 +++ 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 112f8e04c549..3f52efa0f94f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -253,6 +253,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + return __f2fs_set_acl(inode, type, acl, NULL); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 90ff066c9569..48f9366240a2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { set_ckpt_flags(sbi, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; if (!end_io) f2fs_flush_merged_writes(sbi); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 505fb4ee03a4..19cdf9f5261b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -56,6 +56,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto err; + } + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -117,6 +122,7 @@ out_sem: out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); +err: return block_page_mkwrite_return(err); } @@ -313,6 +319,8 @@ out: int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; return f2fs_do_sync_file(file, start, end, datasync, false); } @@ -449,6 +457,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; + if 
(unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -635,6 +646,9 @@ int f2fs_truncate(struct inode *inode) { int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return 0; @@ -713,6 +727,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) int err; bool size_changed = false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + err = inode_change_ok(inode, attr); if (err) return err; @@ -1444,6 +1461,9 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -2420,6 +2440,9 @@ static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + switch (cmd) { case F2FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); @@ -2473,6 +2496,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5503da9c55f8..a2402ccc0779 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -177,6 +177,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -221,6 +224,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = 
F2FS_I_SB(dir); int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir) && !fscrypt_has_permitted_context(dir, inode)) return -EPERM; @@ -420,6 +426,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -472,6 +481,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct fscrypt_symlink_data *sd = NULL; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { err = fscrypt_get_encryption_info(dir); if (err) @@ -578,6 +590,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -630,6 +645,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -724,6 +742,9 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { int err = fscrypt_get_encryption_info(dir); if (err) @@ -735,6 +756,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); } @@ -754,6 +778,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || 
(f2fs_encrypted_inode(new_dir) && @@ -947,6 +974,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8d79b1887cb4..94bbcaeb2b6e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -860,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return 0; + trace_f2fs_sync_fs(sb, sync); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) From 46d4a691f035642af7a51786182963ec8a1748fb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Oct 2017 09:46:54 +0200 Subject: [PATCH 469/804] f2fs: show # of dirty segments via sysfs This patch adds one sysfs entry to show # of dirty segments which can be used for gc timing by user. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 89f25efffd43..48ebe6153cc5 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -63,6 +63,13 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return NULL; } +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -278,6 +285,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif +F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); @@ -320,6 +328,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif + ATTR_LIST(dirty_segments), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), From 5b8ff1301a61a8b93e2e6c3c72c4378a993e8989 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 23 Oct 2017 23:50:15 +0200 Subject: [PATCH 470/804] f2fs: add missing quota_initialize This patch adds calls to quota_initialize in f2fs_set_acl, f2fs_unlink, and f2fs_rename.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 9 +++++++++ fs/f2fs/xattr.c | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a2402ccc0779..f78de030b8b7 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -430,6 +430,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) return -EIO; err = dquot_initialize(dir); + if (err) + return err; + err = dquot_initialize(inode); if (err) return err; @@ -806,6 +809,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; + if (new_inode) { + err = dquot_initialize(new_inode); + if (err) + goto out; + } + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 442c7ec5acd0..1f35ae6a4170 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -718,6 +718,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; + err = dquot_initialize(inode); + if (err) + return err; + /* this case is only from init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, From ada4061e191bd0eba88f811d386e72fcc39cce97 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 26 Oct 2017 10:31:22 +0200 Subject: [PATCH 471/804] f2fs: show current cp state This patch shows whether checkpoint met any error case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d441660c3ba6..f7eec506ceea 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -264,9 +264,10 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). 
#%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW"); + f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_cp_error(si->sbi) ? "Error": "Good"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", From 7368e30495c52bc0b42819e75f78269ddec6c530 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 6 Sep 2017 21:59:50 +0800 Subject: [PATCH 472/804] f2fs: support flexible inline xattr size Now, in product, more and more features based on file encryption were introduced, their demand of xattr space is increasing, however, inline xattr has fixed-size of 200 bytes, once inline xattr space is full, new increased xattr data would occupy additional xattr block which may bring us more space usage and performance regression during persisting. In order to resolve above issue, it's better to expand inline xattr size flexibly according to user's requirement. So this patch introduces new filesystem feature 'flexible inline xattr', and new mount option 'inline_xattr_size=%u', once mkfs enables the feature, we can use the option to make f2fs supporting flexible inline xattr size. To support this feature, we add extra attribute i_inline_xattr_size in inode layout, indicating that how many space inline xattr borrows from block address mapping space in inode layout, by this, we can easily locate and store flexible-sized inline xattr data in inode. Inode disk layout: +----------------------+ | .i_mode | | ... | | .i_ext | +----------------------+ | .i_extra_isize | | .i_inline_xattr_size |-----------+ | ... 
| | +----------------------+ | | .i_addr | | | - block address or | | | - inline data | | +----------------------+<---+ v | inline xattr | +---inline xattr range +----------------------+<---+ | .i_nid | +----------------------+ | node_footer | | (nid, ino, offset) | +----------------------+ Note that, we have to consider backward compatibility which reserved inline_data space, 200 bytes, all the time, reported by Sheng Yong. Previous inline data or directory always reserved 200 bytes in inode layout, even if inline_xattr is disabled. In order to keep inline_dentry's structure for backward compatibility, we get the space back only from inline_data. Signed-off-by: Chao Yu Reported-by: Sheng Yong Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 39 ++++++++++++++++++++++++++------------- fs/f2fs/inode.c | 21 +++++++++++++++++++++ fs/f2fs/namei.c | 13 +++++++++++++ fs/f2fs/node.c | 10 ++++++++-- fs/f2fs/super.c | 32 +++++++++++++++++++++++++++++++- fs/f2fs/sysfs.c | 7 +++++++ fs/f2fs/xattr.c | 18 +++++++++--------- include/linux/f2fs_fs.h | 5 +++-- 8 files changed, 118 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 88d5c99f44e7..6ae52815b33f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -94,6 +94,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -119,6 +120,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -461,11 +463,14 @@ struct f2fs_flush_device { /* for inline stuff */ #define 
DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ - (CUR_ADDRS_PER_INODE(inode) - \ - DEF_INLINE_RESERVED_SIZE - \ - F2FS_INLINE_XATTR_ADDRS)) +static inline int get_inline_xattr_addrs(struct inode *inode); +#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + F2FS_INLINE_XATTR_ADDRS(inode) - \ + DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ @@ -676,6 +681,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1133,6 +1139,7 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ + int inline_xattr_size; /* inline xattr size */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -2247,25 +2254,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; - return CUR_ADDRS_PER_INODE(inode); + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); } -static inline void *inline_xattr_addr(struct page *page) +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS]); + F2FS_INLINE_XATTR_ADDRS(inode)]); } static inline int inline_xattr_size(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - 
return F2FS_INLINE_XATTR_ADDRS << 2; - else - return 0; + return get_inline_xattr_addrs(inode) * sizeof(__le32); } static inline int f2fs_has_inline_data(struct inode *inode) @@ -2435,6 +2437,12 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -3104,6 +3112,11 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3617e7fca930..9684d53563f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -232,6 +232,23 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. 
+ */ + fi->i_inline_xattr_size = 0; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -384,6 +401,10 @@ int update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f78de030b8b7..cf8f4370d256 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + int xattr_size = 0; int err; inode = new_inode(dir->i_sb); @@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = sbi->inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + f2fs_init_extent_tree(inode, NULL); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d7e0a4366527..77b39a058a34 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2194,8 +2194,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page) goto update_inode; } - dst_addr = 
inline_xattr_addr(ipage); - src_addr = inline_xattr_addr(page); + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -2284,6 +2284,12 @@ retry: dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + if (f2fs_sb_has_project_quota(sbi->sb) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 94bbcaeb2b6e..ff3cbfa0c1c9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -92,6 +92,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_noinline_xattr, + Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -141,6 +142,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -383,6 +385,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_xattr: clear_opt(sbi, INLINE_XATTR); break; + case Opt_inline_xattr_size: + if (args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + sbi->inline_xattr_size = arg; + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -604,6 +612,24 @@ static int parse_options(struct super_block *sb, char *options) F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!test_opt(sbi, INLINE_XATTR)) { + f2fs_msg(sb, KERN_ERR, + "inline_xattr_size 
option should be " + "set with inline_xattr option"); + return -EINVAL; + } + if (!sbi->inline_xattr_size || + sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { + f2fs_msg(sb, KERN_ERR, + "inline xattr size is out of range"); + return -EINVAL; + } + } return 0; } @@ -1051,6 +1077,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_xattr"); else seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + sbi->inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1113,6 +1142,7 @@ static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; + sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1674,7 +1704,7 @@ static loff_t max_file_blocks(void) /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - - * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more * space in inode.i_addr, it will be more safe to reassign * result as zero. */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 48ebe6153cc5..e09e59cc678a 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -107,6 +107,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_chksum(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "flexible_inline_xattr"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -216,6 +219,7 @@ enum feat_id { FEAT_EXTRA_ATTR, FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, + FEAT_FLEXIBLE_INLINE_XATTR, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -228,6 +232,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_EXTRA_ATTR: case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: + case FEAT_FLEXIBLE_INLINE_XATTR: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -299,6 +304,7 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -346,6 +352,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(extra_attr), ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), NULL, }; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1f35ae6a4170..bcf455abe845 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,12 +264,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, - void **last_addr, int index, - size_t len, const char *name) +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; - unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + unsigned int inline_size = inline_xattr_size(inode); list_for_each_xattr(entry, base_addr) { if ((void *)entry + sizeof(__u32) > base_addr + inline_size || @@ -297,13 +297,13 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, void 
*inline_addr; if (ipage) { - inline_addr = inline_xattr_addr(ipage); + inline_addr = inline_xattr_addr(inode, ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); - inline_addr = inline_xattr_addr(page); + inline_addr = inline_xattr_addr(inode, page); } memcpy(txattr_addr, inline_addr, inline_size); f2fs_put_page(page, 1); @@ -356,7 +356,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (err) goto out; - *xe = __find_inline_xattr(txattr_addr, &last_addr, + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); if (*xe) goto check; @@ -451,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *inline_addr; if (ipage) { - inline_addr = inline_xattr_addr(ipage); + inline_addr = inline_xattr_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); set_page_dirty(ipage); } else { @@ -460,7 +460,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, alloc_nid_failed(sbi, new_nid); return PTR_ERR(page); } - inline_addr = inline_xattr_addr(page); + inline_addr = inline_xattr_addr(inode, page); f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c2a975e4a711..d79ca96045e4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -184,7 +184,8 @@ struct f2fs_extent { } __packed; #define F2FS_NAME_LEN 255 -#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ +/* 200 bytes for inline xattrs by default */ +#define DEFAULT_INLINE_XATTR_ADDRS 50 #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ get_extra_isize(inode)) @@ -238,7 +239,7 @@ struct f2fs_inode { union { struct { __le16 i_extra_isize; /* extra inode attribute size */ - __le16 i_padding; /* padding */ + __le16 i_inline_xattr_size; /* inline xattr size, 
unit: 4 bytes */ __le32 i_projid; /* project id */ __le32 i_inode_checksum;/* inode meta checksum */ __le32 i_extra_end[0]; /* for attribute size calculation */ From b7b2e629b6f6a360dd2314dcbf594dfe111efc49 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 16 Oct 2017 15:05:16 -0700 Subject: [PATCH 473/804] f2fs: handle error case when adding xattr entry This patch fixes recovering incomplete xattr entries remaining in inline xattr and xattr block, caused by any kind of errors. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 51 +++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index bcf455abe845..7acf56ebda65 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -436,10 +436,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = inline_xattr_size(inode); + struct page *in_page = NULL; void *xattr_addr; + void *inline_addr = NULL; struct page *xpage; nid_t new_nid = 0; - int err; + int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) @@ -447,30 +449,30 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); - set_page_dirty(ipage); } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { + in_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(in_page)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(page); + return PTR_ERR(in_page); } - inline_addr = inline_xattr_addr(inode, page); - f2fs_wait_on_page_writeback(page, NODE, true); + inline_addr = inline_xattr_addr(inode, in_page); } - memcpy(inline_addr, txattr_addr, inline_size); - f2fs_put_page(page, 1); + 
f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { err = truncate_xattr_node(inode); alloc_nid_failed(sbi, new_nid); - return err; + if (err) { + f2fs_put_page(in_page, 1); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + set_page_dirty(ipage ? ipage : in_page); + goto in_page_out; } } @@ -479,7 +481,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } f2fs_bug_on(sbi, new_nid); f2fs_wait_on_page_writeback(xpage, NODE, true); @@ -489,17 +491,24 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } alloc_nid_done(sbi, new_nid); } - xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); - set_page_dirty(xpage); - f2fs_put_page(xpage, 1); - return 0; + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + set_page_dirty(ipage ? ipage : in_page); + set_page_dirty(xpage); + + f2fs_put_page(xpage, 1); +in_page_out: + f2fs_put_page(in_page, 1); + return err; } int f2fs_getxattr(struct inode *inode, int index, const char *name, From e945474a9c1b018159ff5ba398bfafb1dc6a5956 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Fri, 27 Oct 2017 20:45:05 +0800 Subject: [PATCH 474/804] f2fs: support soft block reservation It supports to extend reserved_blocks sysfs interface to be soft threshold, which allows user configure it exceeding current available user space. 
This patch also introduces a new sysfs interface called current_reserved_blocks, which shows the current blocks that have already been reserved. Signed-off-by: Yunlong Song Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 13 ++++++++++++- fs/f2fs/f2fs.h | 13 +++++++++++-- fs/f2fs/super.c | 3 ++- fs/f2fs/sysfs.c | 15 ++++++++++++--- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 500c60403653..2174c66ce1fe 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -132,7 +132,18 @@ What: /sys/fs/f2fs//reserved_blocks Date: June 2017 Contact: "Chao Yu" Description: - Controls current reserved blocks in system. + Controls target reserved blocks in system, the threshold + is soft, it could exceed current available user space. + +What: /sys/fs/f2fs//current_reserved_blocks +Date: October 2017 +Contact: "Yunlong Song" +Contact: "Chao Yu" +Description: + Shows current reserved blocks in system, it may be temporarily + smaller than target_reserved_blocks, but will gradually + increase to target_reserved_blocks when more free blocks are + freed by user later. 
What: /sys/fs/f2fs//gc_urgent Date: August 2017 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6ae52815b33f..d67d1d972459 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1146,6 +1146,7 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ u32 s_next_generation; /* for NFS support */ @@ -1618,7 +1619,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; @@ -1652,6 +1654,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); f2fs_i_blocks_write(inode, count, false, true); } @@ -1798,7 +1804,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count + sbi->reserved_blocks > + if (unlikely(valid_block_count + sbi->current_reserved_blocks > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; @@ -1841,6 +1847,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, sbi->total_valid_node_count--; sbi->total_valid_block_count--; + if 
(sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ff3cbfa0c1c9..8a99182b368f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -988,7 +988,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi) - - sbi->reserved_blocks; + sbi->current_reserved_blocks; avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -2466,6 +2466,7 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e09e59cc678a..4166ac74e837 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -30,7 +30,7 @@ enum { FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif - RESERVED_BLOCKS, + RESERVED_BLOCKS, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -114,6 +114,12 @@ static ssize_t features_show(struct f2fs_attr *a, return len; } +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -153,12 +159,13 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if ((unsigned long)sbi->total_valid_block_count + t > - (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)sbi->user_block_count) { spin_unlock(&sbi->stat_lock); return -EINVAL; } *ui = t; + 
sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); spin_unlock(&sbi->stat_lock); return count; } @@ -293,6 +300,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); #ifdef CONFIG_F2FS_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -338,6 +346,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), NULL, }; From 09a073cc8c565c797b73e525cb0c700cc4daaf77 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:29 +0800 Subject: [PATCH 475/804] f2fs: add missing sysfs description There are some missing sysfs entries' description in document, add them. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2174c66ce1fe..a07134c517e0 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -51,6 +51,12 @@ Description: Controls the dirty page count condition for the in-place-update policies. +What: /sys/fs/f2fs//min_hot_blocks +Date: March 2017 +Contact: "Jaegeuk Kim" +Description: + Controls the dirty page count condition for redefining hot data. + What: /sys/fs/f2fs//max_small_discards Date: November 2013 Contact: "Jaegeuk Kim" @@ -96,6 +102,18 @@ Contact: "Jaegeuk Kim" Description: Controls the checkpoint timing. +What: /sys/fs/f2fs//idle_interval +Date: January 2016 +Contact: "Jaegeuk Kim" +Description: + Controls the idle timing. 
+ +What: /sys/fs/f2fs//iostat_enable +Date: August 2017 +Contact: "Chao Yu" +Description: + Controls to enable/disable IO stat. + What: /sys/fs/f2fs//ra_nid_pages Date: October 2015 Contact: "Chao Yu" @@ -116,6 +134,12 @@ Contact: "Shuoran Liu" Description: Shows total written kbytes issued to disk. +What: /sys/fs/f2fs//feature +Date: July 2017 +Contact: "Jaegeuk Kim" +Description: + Shows all enabled features in current device. + What: /sys/fs/f2fs//inject_rate Date: May 2016 Contact: "Sheng Yong" From 583b7a274c2719d939b0703e53d480edc35c8ac1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:30 +0800 Subject: [PATCH 476/804] f2fs: support get_page error injection This patch adds to support get_page error injection to simulate out-of-memory test scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 14 ++++++++++++++ fs/f2fs/gc.c | 4 ++-- fs/f2fs/node.c | 3 ++- fs/f2fs/super.c | 1 + 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6750584b7107..81ec0c6aeedf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1962,7 +1962,8 @@ repeat: * Do not use grab_cache_page_write_begin() to avoid deadlock due to * wait_for_stable_page. Will wait that below with our IO control. 
*/ - page = grab_cache_page(mapping, index); + page = f2fs_pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; goto fail; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d67d1d972459..b35d894762b5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -47,6 +47,7 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -1896,6 +1897,19 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); } +static inline struct page *f2fs_pagecache_get_page( + struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { + f2fs_show_injection_info(FAULT_PAGE_GET); + return NULL; + } +#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); +} + static inline void f2fs_copy_page(struct page *src, struct page *dst) { char *src_kaddr = kmap(src); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 32b0b0632e15..359e7b5590f6 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -650,8 +650,8 @@ static void move_data_block(struct inode *inode, block_t bidx, allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, - FGP_LOCK | FGP_CREAT, GFP_NOFS); + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; goto recover_block; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 77b39a058a34..1c81a915c343 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1218,7 +1218,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + page = 
f2fs_pagecache_get_page(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8a99182b368f..a5304d9d1392 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -44,6 +44,7 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", From 5612922fb0acb33f54fc0a67837b3510dde4b00b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:31 +0800 Subject: [PATCH 477/804] f2fs: support bio allocation error injection This patch adds to support bio allocation error injection to simulate out-of-memory test scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 23 +++++++++++++++++------ fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 1 + 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 81ec0c6aeedf..043394aa6c62 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -171,7 +171,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - bio = f2fs_bio_alloc(npages); + bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? 
f2fs_read_end_io : f2fs_write_end_io; @@ -471,7 +471,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, f2fs_wait_on_block_writeback(sbi, blkaddr); } - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) { if (ctx) fscrypt_release_ctx(ctx); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b35d894762b5..28df48ca5f1e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -48,6 +48,7 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, + FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -1959,15 +1960,25 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(int npages) +static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, + int npages, bool no_fail) { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; + if (no_fail) { + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(FAULT_ALLOC_BIO); + return NULL; + } +#endif + return bio_alloc(GFP_KERNEL, npages); } static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 85295baa74c8..132e1e424ffe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -511,7 +511,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(0); + struct bio *bio = f2fs_bio_alloc(sbi, 0, true); int ret; bio->bi_rw = REQ_OP_WRITE; @@ -943,7 +943,7 @@ static int __blkdev_issue_discard(struct block_device 
*bdev, sector_t sector, if (ret) return ret; } - bio = f2fs_bio_alloc(1); + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, 1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5304d9d1392..4c87c75ed352 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -45,6 +45,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", From 90c28a18d2a499c53dbff24b382d1b8e4e9547d3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:32 +0800 Subject: [PATCH 478/804] f2fs: give correct trimmed blocks in fstrim We already support issuing discards in a specified range during fstrim, so it needs to return to the caller the successfully trimmed bytes in that range instead of the bytes of invalid blocks which are scanned in checkpoint. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 27 +++++++++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 28df48ca5f1e..8e6ad6543bcf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -229,7 +229,6 @@ struct cp_control { __u64 trim_start; __u64 trim_end; __u64 trim_minlen; - __u64 trimmed; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 132e1e424ffe..27a6df3bbff3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1336,21 +1336,27 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } -static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; - if (!dc->ref) + if (!dc->ref) { + if (!dc->error) + len = dc->len; __remove_discard_cmd(sbi, dc); + } mutex_unlock(&dcc->cmd_lock); + + return len; } -static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, block_t start, block_t end) { @@ -1359,6 +1365,7 @@ static void __wait_discard_cmd_range(struct f2fs_sb_info *sbi, &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; + unsigned int trimmed = 0; next: need_wait = false; @@ -1371,6 +1378,8 @@ next: continue; if (dc->state == D_DONE && !dc->ref) { wait_for_completion_io(&dc->wait); + if (!dc->error) + trimmed += dc->len; __remove_discard_cmd(sbi, dc); } else { dc->ref++; @@ -1381,9 +1390,11 @@ next: mutex_unlock(&dcc->cmd_lock); if (need_wait) { - __wait_one_discard_bio(sbi, dc); + trimmed += __wait_one_discard_bio(sbi, dc); goto next; } + + return trimmed; } static void 
__wait_all_discard_cmd(struct f2fs_sb_info *sbi, @@ -1744,7 +1755,6 @@ find_next: f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); - cpc->trimmed += len; total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, @@ -2447,12 +2457,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; + unsigned long long trimmed = 0; int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - cpc.trimmed = 0; if (end <= MAIN_BLKADDR(sbi)) goto out; @@ -2499,9 +2509,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); out: - range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } From ae66786296b4a210c75db6259300636ceb1abdba Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Oct 2017 16:52:33 +0800 Subject: [PATCH 479/804] f2fs: export SSR allocation threshold This patch exports the min_ssr_sections threshold in sysfs to let users flexibly control when SSR allocation is triggered. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 2 ++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index a07134c517e0..2baed1151eac 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,12 @@ Contact: "Jaegeuk Kim" Description: Controls the dirty page count condition for redefining hot data. 
+What: /sys/fs/f2fs//min_ssr_sections +Date: October 2017 +Contact: "Chao Yu" +Description: + Controls the free section threshold to trigger SSR allocation. + What: /sys/fs/f2fs//max_small_discards Date: November 2013 Contact: "Jaegeuk Kim" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8e6ad6543bcf..6654a96f0907 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -904,6 +904,7 @@ struct f2fs_sm_info { unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -1141,6 +1142,7 @@ struct f2fs_sb_info { int active_logs; /* # of active logs */ int dir_level; /* directory level */ int inline_xattr_size; /* inline xattr size */ + unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 27a6df3bbff3..af536d427424 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -181,7 +181,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void register_inmem_page(struct inode *inode, struct page *page) @@ -3751,6 +3751,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 4166ac74e837..f0fdc89ce82f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -285,6 +285,7 @@ 
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -330,6 +331,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), From 873ec505cb075ab6ad41721289fd8530ba777cab Mon Sep 17 00:00:00 2001 From: Fan Li Date: Sat, 28 Oct 2017 19:03:37 +0800 Subject: [PATCH 480/804] f2fs: add a function to move nid This patch adds a new function to move a nid from one state to another. The move operation is heavily used; by adding a new function for it we can cut down some branches from several flows. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 57 +++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1c81a915c343..a1f8307b1085 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1765,15 +1765,13 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, } static int __insert_free_nid(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_state state, bool new) + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (new) { - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); - if (err) - return err; - } + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; f2fs_bug_on(sbi, state != i->state); nm_i->nid_cnt[state]++; @@ -1783,7 +1781,7 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi, } static void __remove_free_nid(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_state state, bool reuse) + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1791,8 +1789,29 @@ static void __remove_free_nid(struct f2fs_sb_info *sbi, nm_i->nid_cnt[state]--; if (state == FREE_NID) list_del(&i->list); - if (!reuse) - radix_tree_delete(&nm_i->free_nid_root, i->nid); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); + } } /* return if the nid is recognized as free */ @@ -1852,7 +1871,7 @@ static bool add_free_nid(struct 
f2fs_sb_info *sbi, nid_t nid, bool build) } } ret = true; - err = __insert_free_nid(sbi, i, FREE_NID, true); + err = __insert_free_nid(sbi, i, FREE_NID); err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); @@ -1871,7 +1890,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == FREE_NID) { - __remove_free_nid(sbi, i, FREE_NID, false); + __remove_free_nid(sbi, i, FREE_NID); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -2082,9 +2101,7 @@ retry: struct free_nid, list); *nid = i->nid; - __remove_free_nid(sbi, i, FREE_NID, true); - i->state = PREALLOC_NID; - __insert_free_nid(sbi, i, PREALLOC_NID, false); + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); nm_i->available_nids--; update_free_nid_bitmap(sbi, *nid, false, false); @@ -2110,7 +2127,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_free_nid(sbi, i, PREALLOC_NID, false); + __remove_free_nid(sbi, i, PREALLOC_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -2133,12 +2150,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, !i); if (!available_free_memory(sbi, FREE_NIDS)) { - __remove_free_nid(sbi, i, PREALLOC_NID, false); + __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { - __remove_free_nid(sbi, i, PREALLOC_NID, true); - i->state = FREE_NID; - __insert_free_nid(sbi, i, FREE_NID, false); + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); } nm_i->available_nids++; @@ -2169,7 +2184,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) break; - __remove_free_nid(sbi, i, FREE_NID, false); + __remove_free_nid(sbi, i, FREE_NID); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2748,7 +2763,7 @@ void destroy_node_manager(struct 
f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - __remove_free_nid(sbi, i, FREE_NID, false); + __remove_free_nid(sbi, i, FREE_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From c713fdb5a23cdda4ed85e04c7dec1094ab9c691f Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 30 Oct 2017 09:33:41 +0800 Subject: [PATCH 481/804] Revert "f2fs: handle dirty segments inside refresh_sit_entry" This reverts commit 5e443818fa0b2a2845561ee25bec181424fb2889 The commit should be reverted because call sequence of below two parts of code must be kept: a. update sit information, it needs to be updated before segment allocation since latter allocation may trigger SSR, and SSR allocation needs latest valid block information of all segments. b. update segment status, it needs to be updated after segment allocation since we can skip updating current opened segment status. 
Fixes: 5e443818fa0b ("f2fs: handle dirty segments inside refresh_sit_entry") Suggested-by: Chao Yu Signed-off-by: Yunlong Song Reviewed-by: Chao Yu [Jaegeuk Kim: remove refresh_sit_entry function] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 27 ++++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6654a96f0907..d8fcdead2aa5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2683,7 +2683,6 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void init_discard_policy(struct discard_policy *dpolicy, int discard_type, unsigned int granularity); -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index af536d427424..f59b00aa502b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1974,16 +1974,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) -{ - update_sit_entry(sbi, new, 1); - if (GET_SEGNO(sbi, old) != NULL_SEGNO) - update_sit_entry(sbi, old, -1); - - locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); - locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); -} - void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); @@ -2620,13 +2610,24 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. 
+ */ + update_sit_entry(sbi, *new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); + /* - * SIT information should be updated after segment allocation, - * since we need to keep dirty segments precisely under SSR. + * segment dirty status should be updated after segment allocation, + * so we just need to update status only one time after previous + * segment being closed. */ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); mutex_unlock(&sit_i->sentry_lock); From f46ae958c701e580c4405c7b455438980d78585a Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 30 Oct 2017 14:18:55 +0800 Subject: [PATCH 482/804] f2fs: modify for accurate fggc node io stat modify for accurate fggc node io stat Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 62 +++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a1f8307b1085..99c966fcf32d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1243,37 +1243,6 @@ iput_out: iput(inode); } -void move_node_page(struct page *node_page, int gc_type) -{ - if (gc_type == FG_GC) { - struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - .for_reclaim = 0, - }; - - set_page_dirty(node_page); - f2fs_wait_on_page_writeback(node_page, NODE, true); - - f2fs_bug_on(sbi, PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) - goto out_page; - - if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) - unlock_page(node_page); - goto release_page; - } else { - /* set page dirty and write it */ - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } 
-out_page: - unlock_page(node_page); -release_page: - f2fs_put_page(node_page, 0); -} - static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; @@ -1416,6 +1385,37 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (__write_node_page(node_page, false, NULL, + &wbc, false, FS_GC_NODE_IO)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { From ddb8e2ae9811a6e1a186e1366db817924231842e Mon Sep 17 00:00:00 2001 From: Fan Li Date: Mon, 30 Oct 2017 15:19:48 +0800 Subject: [PATCH 483/804] f2fs: optimize __update_nat_bits Make three modifications to __update_nat_bits: 1. Take the code dealing with the nat with nid 0 out of the loop. Such a nat only needs to be dealt with once at the beginning. 2. Use " nat_index == 0" instead of " start_nid == 0" to decide if it's the first nat block. It's better that we don't assume @start_nid is the first nid of the nat block it's in. 3. Use " if (nat_blk->entries[i].block_addr != NULL_ADDR)" to explicitly confirm the value of block_addr; use the constant to make sure the code is right, even if the value of NULL_ADDR changes. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 99c966fcf32d..09707de3c9c5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2424,15 +2424,17 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; - int i; + int i = 0; if (!enabled_nat_bits(sbi, NULL)) return; - for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { - if (start_nid == 0 && i == 0) - valid++; - if (nat_blk->entries[i].block_addr) + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (nat_blk->entries[i].block_addr != NULL_ADDR) valid++; } if (valid == 0) { From 26dfec49b25a3a895181b5f76bc8c762924f6197 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 Oct 2017 21:03:06 -0700 Subject: [PATCH 484/804] f2fs: add quota_ino feature infra This patch adds quota_ino feature infra to be used for quota files. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/sysfs.c | 7 +++++++ include/linux/f2fs_fs.h | 6 +++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d8fcdead2aa5..ff5cd87e745f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -123,6 +123,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -3151,6 +3152,11 @@ static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); } +static inline int f2fs_sb_has_quota_ino(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f0fdc89ce82f..9835348b6e5d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -110,6 +110,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_flexible_inline_xattr(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "quota_ino"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -227,6 +230,7 @@ enum feat_id { FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, FEAT_FLEXIBLE_INLINE_XATTR, + FEAT_QUOTA_INO, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -240,6 +244,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: case FEAT_FLEXIBLE_INLINE_XATTR: + case FEAT_QUOTA_INO: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -314,6 +319,7 @@ F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -364,6 +370,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), NULL, }; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index d79ca96045e4..3e15df3cebcb 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,9 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_QUOTA_INO 3 +#define F2FS_MAX_QUOTAS 3 + #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ #define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ @@ -108,7 +111,8 @@ struct f2fs_super_block { __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ struct f2fs_device devs[MAX_DEVICES]; /* device list */ - __u8 reserved[327]; /* valid reserved region */ + __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ + 
__u8 reserved[315]; /* valid reserved region */ } __packed; /* From 82750d346ab7d09c9ffc2d9ecce84420422bd6fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 Oct 2017 09:14:28 -0700 Subject: [PATCH 485/804] f2fs: support quota sys files This patch supports hidden quota files in the system, which will be used for Android. It requires up-to-date f2fs-tools later than v1.9.0. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 9 ++- fs/f2fs/f2fs.h | 10 ++- fs/f2fs/recovery.c | 8 ++- fs/f2fs/super.c | 149 ++++++++++++++++++++++++++++++++++++---- include/linux/f2fs_fs.h | 1 - 5 files changed, 158 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 48f9366240a2..a69795e046bb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -616,6 +616,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; @@ -628,8 +631,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -657,7 +661,8 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff5cd87e745f..72d5ea456250 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1445,6 +1445,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) 
return le64_to_cpu(cp->checkpoint_ver); } +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); @@ -2406,6 +2413,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) return ret; } +#define sb_rdonly f2fs_readonly static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -2607,7 +2615,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9626758bc762..92c57ace1939 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -594,6 +594,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (s_flags & MS_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); @@ -604,7 +607,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -665,7 +668,8 @@ skip: out: #ifdef CONFIG_QUOTA /* Turn quotas 
off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c87c75ed352..e304ce603c5d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -213,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } + if (f2fs_sb_has_quota_ino(sb)) { + f2fs_msg(sb, KERN_INFO, + "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + qname = match_strdup(args); if (!qname) { f2fs_msg(sb, KERN_ERR, @@ -291,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } } + + if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_INFO, + "QUOTA feature is enabled, so ignore jquota_fmt"); + sbi->s_jquota_fmt = 0; + } + if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -1; + } return 0; } #endif @@ -1173,6 +1191,9 @@ static void default_options(struct f2fs_sb_info *sbi) #endif } +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1239,6 +1260,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; +#ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) @@ -1246,9 +1268,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } else { /* dquot_resume needs RW */ sb->s_flags &= ~MS_RDONLY; - dquot_resume(sb, -1); + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + goto 
restore_opts; + } } - +#endif /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1455,19 +1483,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) sbi->s_jquota_fmt, type); } -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) { - int i, ret; + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } for (i = 0; i < MAXQUOTAS; i++) { if (sbi->s_qf_names[i]) { - ret = f2fs_quota_on_mount(sbi, i); - if (ret < 0) - f2fs_msg(sbi->sb, KERN_ERR, - "Cannot turn on journaled " - "quota: error %d", ret); + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quotas: %d on %d", err, i); } } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(sb)); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_msg(sb, KERN_ERR, + "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(F2FS_SB(sb), USRQUOTA), + test_opt(F2FS_SB(sb), GRPQUOTA), + test_opt(F2FS_SB(sb), PRJQUOTA), + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < 
MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; + } + } + } + return 0; } static int f2fs_quota_sync(struct super_block *sb, int type) @@ -1538,7 +1638,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); - if (err) + if (err || f2fs_sb_has_quota_ino(sb)) goto out_put; inode_lock(inode); @@ -2372,7 +2472,10 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - sb->s_qcop = &f2fs_quotactl_ops; + if (f2fs_sb_has_quota_ino(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif @@ -2543,10 +2646,24 @@ try_onemore: if (err) goto free_root_inode; +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. 
+ */ + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", err); + goto free_sysfs; + } + } +#endif /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) - goto free_sysfs; + goto free_meta; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2580,7 +2697,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_sysfs; + goto free_meta; } } skip_recovery: @@ -2614,6 +2731,10 @@ skip_recovery: return 0; free_meta: +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by recover_orphan_inodes() @@ -2622,7 +2743,9 @@ free_meta: * falls into an infinite loop in sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); +#ifdef CONFIG_QUOTA free_sysfs: +#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3e15df3cebcb..fef1caeddf54 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,7 +36,6 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) -#define F2FS_QUOTA_INO 3 #define F2FS_MAX_QUOTAS 3 #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ From c5470498e59be4c3d9ebc9d7ee396dd8e6c6b1ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Oct 2017 17:49:53 +0800 Subject: [PATCH 486/804] f2fs: use rw_semaphore to protect SIT cache There are some cases user didn't update SIT cache under this lock, so let's use rw_semaphore instead of mutex to enhance concurrently accessing. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 12 ++++++------ fs/f2fs/segment.c | 34 +++++++++++++++++++--------------- fs/f2fs/segment.h | 2 +- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 359e7b5590f6..297c204ea221 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -456,10 +456,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, struct seg_entry *sentry; int ret; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return ret; } @@ -893,10 +893,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, struct sit_info *sit_i = SIT_I(sbi); int ret; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); return ret; } @@ -944,8 +944,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* * this is to avoid deadlock: * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f59b00aa502b..7dfd4580380e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1984,14 +1984,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; /* add it into sit main buffer */ - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } bool 
is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) @@ -2004,7 +2004,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return true; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); @@ -2013,7 +2013,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return is_cp; } @@ -2409,12 +2409,16 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); locate_dirty_segment(sbi, old_segno); } + + up_write(&SIT_I(sbi)->sentry_lock); } static const struct segment_allocation default_salloc_ops = { @@ -2426,14 +2430,14 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u64 trim_start = cpc->trim_start; bool has_candidate = false; - mutex_lock(&SIT_I(sbi)->sentry_lock); + down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } - mutex_unlock(&SIT_I(sbi)->sentry_lock); + up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; @@ -2593,7 +2597,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -2629,7 +2633,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); @@ -2787,7 +2791,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; @@ -2819,7 +2823,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg->next_blkoff = old_blkoff; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } @@ -3274,7 +3278,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool to_journal = true; struct seg_entry *se; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; @@ -3368,7 +3372,7 @@ out: cpc->trim_start = trim_start; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } @@ -3461,7 +3465,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; - mutex_init(&sit_i->sentry_lock); + init_rwsem(&sit_i->sentry_lock); return 0; } @@ -3702,7 +3706,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); sit_i->min_mtime = LLONG_MAX; @@ -3719,7 +3723,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } int build_segment_manager(struct f2fs_sb_info *sbi) diff 
--git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9342b973da65..4f19eb45eada 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -231,7 +231,7 @@ struct sit_info { unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ - struct mutex sentry_lock; /* to protect SIT cache */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ From 42c7c71824fc026f8d0ed1c2261680752ba7ffa4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 30 Oct 2017 17:49:54 +0800 Subject: [PATCH 487/804] f2fs: check curseg space before foreground GC When we are close to triggering foreground GC, if there are only a few dirty metas, we can log these dirty metas in the space left in opened segments instead of triggering foreground GC. With this patch, the total count of foreground GC triggered by test/generic/* of the fstest suite is reduced from 254 to 184. So let's do the check before foreground GC anyway.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 4f19eb45eada..5264b6ed120c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -497,6 +497,33 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +{ + unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; + int i; + + /* check current node segment */ + for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + segno = CURSEG_I(sbi, i)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + if (node_blocks > left_blocks) + return false; + } + + /* check current data segment */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (dent_blocks > left_blocks) + return false; + return true; +} + static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -507,6 +534,9 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; + if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && + has_curseg_enough_space(sbi)) + return false; return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + needed); From 8b92814117d5b040c30c4978b5489dcac166a8aa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Oct 2017 11:11:56 -0400 Subject: [PATCH 488/804] f2fs: don't bother with inode->i_version f2fs does not set the SB_I_VERSION flag, so the i_version will never be incremented on write. 
It was recently changed to increment the i_version on a quota write, which isn't necessary here. Signed-off-by: Jeff Layton Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e304ce603c5d..76e2f1518224 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -664,7 +664,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); /* Initialize f2fs-specific inode info */ - fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; @@ -1461,7 +1460,6 @@ retry: if (len == towrite) return err; - inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; From 55c7b9595bb93d69f8b099cd9915ae19cefe53a0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Nov 2017 20:41:01 +0800 Subject: [PATCH 489/804] f2fs: remove unneeded semicolon Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a69795e046bb..d6c02bb8fcf8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1017,7 +1017,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) update_inode_page(inode); iput(inode); } - }; + } return 0; } From 44889e487981b1aa258399696a35f1a7be96ea9f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Nov 2017 20:41:02 +0800 Subject: [PATCH 490/804] f2fs: remove dead code in update_meta_page After commit a468f0ef516f ("f2fs: use crc and cp version to determine roll-forward recovery"), last caller of update_meta_page passing @src with NULL is gone, so remove related dead code there. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7dfd4580380e..9538e1ac652d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2071,12 +2071,8 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = grab_meta_page(sbi, blk_addr); - void *dst = page_address(page); - if (src) - memcpy(dst, src, PAGE_SIZE); - else - memset(dst, 0, PAGE_SIZE); + memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } From 3e3b40557525c0bdb32f4b8d19f02b660245bc27 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 2 Nov 2017 20:41:03 +0800 Subject: [PATCH 491/804] f2fs: fix summary info corruption Sometimes, after running generic/270 of fstest, fsck reports summary info and actual position of block address in direct node becoming inconsistent. The root cause is race in between __f2fs_replace_block and change_curseg as below: Thread A Thread B - __clone_blkaddrs - f2fs_replace_block - __f2fs_replace_block - segnoA = GET_SEGNO(sbi, blkaddrA); - type = se->type:=CURSEG_HOT_DATA - if (!IS_CURSEG(sbi, segnoA)) type = CURSEG_WARM_DATA - allocate_data_block - allocate_segment - get_ssr_segment - change_curseg(segnoA, CURSEG_HOT_DATA) - change_curseg(segnoA, CURSEG_WARM_DATA) - reset_curseg - __set_sit_entry_type - change se->type from CURSEG_HOT_DATA to CURSEG_WARM_DATA So finally, hot curseg locates in segnoA, but type of segnoA becomes CURSEG_WARM_DATA. Then if we invoke __f2fs_replace_block(blkaddrB, blkaddrA, true, false), as blkaddrA locates in segnoA, so we will move warm type curseg to segnoA, then change its summary cache and writeback it to summary block. 
But segnoA is used by hot type curseg too, once it moves or persist, it will cover summary block content with inner old summary cache, result in inconsistent status. This patch tries to fix this issue by introduce global curseg lock to avoid race in between __f2fs_replace_block and change_curseg. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 28 +++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 72d5ea456250..b6b382888a94 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -884,6 +884,8 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ + struct rw_semaphore curseg_lock; /* for preventing curseg change */ + block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9538e1ac652d..734c6a880633 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2592,6 +2592,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + down_read(&SM_I(sbi)->curseg_lock); + mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); @@ -2649,6 +2651,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); } static void update_device_state(struct f2fs_io_info *fio) @@ -2756,6 +2760,18 @@ int rewrite_data_page(struct f2fs_io_info *fio) return err; } +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; +} + void 
__f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) @@ -2771,6 +2787,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; + down_write(&SM_I(sbi)->curseg_lock); + if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { @@ -2780,8 +2798,13 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, type = CURSEG_WARM_DATA; } } else { - if (!IS_CURSEG(sbi, segno)) + if (IS_CURSEG(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { type = CURSEG_WARM_DATA; + } } curseg = CURSEG_I(sbi, type); @@ -2821,6 +2844,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); + up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -3758,6 +3782,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sm_info->sit_entry_set); + init_rwsem(&sm_info->curseg_lock); + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) From 4423778adf0e777147f9c0252f6a4f42cbb91256 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Thu, 2 Nov 2017 11:02:52 +0800 Subject: [PATCH 492/804] f2fs: save a multiplication for last_nid calculation Use a slightly easier way to calculate last_nid. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 09707de3c9c5..930bdb90faac 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2629,7 +2629,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) __set_bit_le(i, nm_i->nat_block_bitmap); nid = i * NAT_ENTRY_PER_BLOCK; - last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) From 3c8f767e13741c5174909e39f238655c82be1c20 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 3 Nov 2017 10:21:05 +0800 Subject: [PATCH 493/804] f2fs: avoid race in between GC and block exchange During block exchange in {insert,collapse,move}_range, page-block mapping is unstable due to mapping moving or recovery, so there should be no concurrent cache read operation relying on such mapping, nor any cache write operation messing up the block exchange. So this patch lets background GC be aware of that.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 26 +++++++++++++++++++++----- fs/f2fs/gc.c | 7 +++++++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 19cdf9f5261b..62f23f82b971 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1174,11 +1174,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - goto out; + goto out_unlock; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1190,7 +1193,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); - +out_unlock: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1373,6 +1377,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1400,6 +1407,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); + + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -2254,9 +2263,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); + down_write(&F2FS_I(src)->dio_rwsem[WRITE]); if (src != dst) { - if (!inode_trylock(dst)) { - ret = -EBUSY; + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + inode_unlock(dst); goto out; } } @@ -2316,9 +2329,12 @@ 
static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } f2fs_unlock_op(sbi); out_unlock: - if (src != dst) + if (src != dst) { + up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); inode_unlock(dst); + } out: + up_write(&F2FS_I(src)->dio_rwsem[WRITE]); inode_unlock(src); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 297c204ea221..be9fd616736b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -832,10 +832,17 @@ next_step: continue; } + if (!down_write_trylock( + &F2FS_I(inode)->dio_rwsem[WRITE])) { + iput(inode); + continue; + } + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; From 5d4b6efcfd09ce00a2ef238ee333cdabcf1d87c6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 5 Nov 2017 21:53:30 +0800 Subject: [PATCH 494/804] f2fs: keep isize once block is reserved cross EOF Without FADVISE_KEEP_SIZE_BIT, we will try to recover file size according to last non-hole block, so in fallocate(), we must set FADVISE_KEEP_SIZE_BIT flag once we have preallocated block cross EOF, instead of when all preallocation is success. Otherwise, file size will be incorrect due to lack of this flag. Simple testcase to reproduce this: 1. echo 2 > /sys/fs/f2fs//inject_type 2. echo 10 > /sys/fs/f2fs//inject_rate 3. run tests/generic/392 4. disable fault injection 5. 
do remount Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 62f23f82b971..0ebf08f00b8f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1458,8 +1458,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } return err; } @@ -1506,8 +1510,6 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); - if (mode & FALLOC_FL_KEEP_SIZE) - file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } From 0186182c0c4d208a43d0c09bb04027b9e7e8f15a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 6 Nov 2017 22:51:45 +0800 Subject: [PATCH 495/804] f2fs: trace checkpoint reason in fsync() This patch slightly changes need_do_checkpoint to return the detail info that indicates why we need do checkpoint, then caller could print it with trace message. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 ++++++++++++ fs/f2fs/file.c | 34 ++++++++++++++++++---------------- include/trace/events/f2fs.h | 24 ++++++++++++++++++------ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b6b382888a94..31edffdd51d4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -977,6 +977,18 @@ enum need_lock_type { LOCK_RETRY, }; +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, +}; + enum iostat_type { APP_DIRECT_IO, /* app direct IOs */ APP_BUFFERED_IO, /* app buffered IOs */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0ebf08f00b8f..3de13816d2ac 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -147,27 +147,29 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) return 1; } -static inline bool need_do_checkpoint(struct inode *inode) +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool need_cp = false; + enum cp_reason_type cp_reason = CP_NO_NEEDED; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) - need_cp = true; + cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) - need_cp = true; + cp_reason = CP_WRONG_PINO; else if (!space_for_roll_forward(sbi)) - need_cp = true; + cp_reason = CP_NO_SPC_ROLL; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; + cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) - need_cp = true; + cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) - need_cp = true; + cp_reason = CP_SPEC_LOG_NUM; - return need_cp; + return cp_reason; } static bool need_inode_page_update(struct 
f2fs_sb_info *sbi, nid_t ino) @@ -202,7 +204,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; - bool need_cp = false; + enum cp_reason_type cp_reason = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, @@ -221,7 +223,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, clear_inode_flag(inode, FI_NEED_IPU); if (ret) { - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } @@ -252,10 +254,10 @@ go_write: * sudden-power-off. */ down_read(&F2FS_I(inode)->i_sem); - need_cp = need_do_checkpoint(inode); + cp_reason = need_do_checkpoint(inode); up_read(&F2FS_I(inode)->i_sem); - if (need_cp) { + if (cp_reason) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); @@ -312,7 +314,7 @@ flush_out: } f2fs_update_time(sbi, REQ_TIME); out: - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c9be882c2718..589df6f73789 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -128,6 +128,18 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_DISCARD, "Discard" }, \ { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" }) +#define show_fsync_cpreason(type) \ + __print_symbolic(type, \ + { CP_NO_NEEDED, "no needed" }, \ + { CP_NON_REGULAR, "non regular" }, \ + { CP_HARDLINK, "hardlink" }, \ + { CP_SB_NEED_CP, "sb needs cp" }, \ + { CP_WRONG_PINO, "wrong pino" }, \ + { CP_NO_SPC_ROLL, "no space roll forward" }, \ + { CP_NODE_NEED_CP, "node needs cp" }, \ + { CP_FASTBOOT_MODE, "fastboot mode" }, \ + { CP_SPEC_LOG_NUM, "log type is 2" }) + struct victim_sel_policy; struct f2fs_map_blocks; @@ -202,14 +214,14 @@ 
DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter, TRACE_EVENT(f2fs_sync_file_exit, - TP_PROTO(struct inode *inode, int need_cp, int datasync, int ret), + TP_PROTO(struct inode *inode, int cp_reason, int datasync, int ret), - TP_ARGS(inode, need_cp, datasync, ret), + TP_ARGS(inode, cp_reason, datasync, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(int, need_cp) + __field(int, cp_reason) __field(int, datasync) __field(int, ret) ), @@ -217,15 +229,15 @@ TRACE_EVENT(f2fs_sync_file_exit, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->need_cp = need_cp; + __entry->cp_reason = cp_reason; __entry->datasync = datasync; __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, checkpoint is %s, " + TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, " "datasync = %d, ret = %d", show_dev_ino(__entry), - __entry->need_cp ? "needed" : "not needed", + show_fsync_cpreason(__entry->cp_reason), __entry->datasync, __entry->ret) ); From 460688b59e8bc67d25340c430c099f1c8ebcdb4d Mon Sep 17 00:00:00 2001 From: Fan Li Date: Tue, 7 Nov 2017 11:04:33 +0800 Subject: [PATCH 496/804] f2fs: keep scanning until enough free nids are acquired In current version, after scan_free_nid_bits, the scan is over if nid_cnt[FREE_NID] != 0. In most cases, there are still free nids in the free list during the scan, and scan_free_nid_bits usually can't increase nid_cnt[FREE_NID]. It causes that __build_free_nids is called many times without solving the shortage of the free nids. This patch fixes that. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 930bdb90faac..c75c1ac06f3a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2018,7 +2018,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); - if (nm_i->nid_cnt[FREE_NID]) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; } From ca28e9670e807900f4ad9a447ffa50b0b4cbff5f Mon Sep 17 00:00:00 2001 From: Fan Li Date: Tue, 7 Nov 2017 19:14:24 +0800 Subject: [PATCH 497/804] f2fs: optimize the way of traversing free_nid_bitmap We call scan_free_nid_bits only when there aren't many free nids left, which means that the marked bits in free_nid_bitmap are supposed to be few, so using find_next_bit_le is more efficient in such a case. According to my tests, using find_next_bit_le instead of test_bit_le cuts the traversal time down to one third of the original.
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c75c1ac06f3a..ffaa695224f7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1958,6 +1958,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; unsigned int i, idx; + nid_t nid; down_read(&nm_i->nat_tree_lock); @@ -1967,10 +1968,10 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) if (!nm_i->free_nid_count[i]) continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { - nid_t nid; - - if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) - continue; + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) + break; nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); @@ -1983,7 +1984,6 @@ out: down_read(&curseg->journal_rwsem); for (i = 0; i < nats_in_cursum(journal); i++) { block_t addr; - nid_t nid; addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); nid = le32_to_cpu(nid_in_journal(journal, i)); From ac9819160586ff12691558c3a3b07554069a8024 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 8 Nov 2017 17:47:36 +0800 Subject: [PATCH 498/804] f2fs: introduce scan_curseg_cache for cleanup Commit 4ac912427c42 ("f2fs: introduce free nid bitmap") copied codes from __build_free_nids() into scan_free_nid_bits(), they are redundant, introduce one common function scan_curseg_cache for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ffaa695224f7..62e597b08e09 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1952,11 +1952,30 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } +static void scan_curseg_cache(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + static void scan_free_nid_bits(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_journal *journal = curseg->journal; unsigned int i, idx; nid_t nid; @@ -1981,26 +2000,14 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) } } out: - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); } static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; @@ -2046,18 +2053,8 @@ static void 
__build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), From 47af6c72d9440c90674d9b79da13ce8922491d24 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 9 Nov 2017 14:51:27 +0900 Subject: [PATCH 499/804] f2fs: apply write hints to select the type of segments for buffered write Write hints help F2FS to determine which type of segments would be selected for buffered write. This patch implements the mapping from write hints to segment types as shown below. hints segment type ----- ------------ WRITE_LIFE_SHORT CURSEG_HOT_DATA WRITE_LIFE_EXTREME CURSEG_COLD_DATA others CURSEG_WARM_DATA The F2FS policy for hot/cold separation has precedence over these hints. And hints are not applied in in-place update.
Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 734c6a880633..94939a5a96c8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2514,6 +2514,20 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } +#if 0 +int rw_hint_to_seg_type(enum rw_hint hint) +{ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + return CURSEG_WARM_DATA; + } +} +#endif + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2548,6 +2562,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; + /* rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; } else { if (IS_DNODE(fio->page)) From baf9275a4bbdf42fcc443ba3ba90482ee9995665 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 10 Nov 2017 09:30:42 +0800 Subject: [PATCH 500/804] f2fs: avoid opened loop codes in __add_ino_entry We will keep __add_ino_entry success all the time, for ENOMEM failure case, we have already handled it by using __GFP_NOFAIL flag, so we don't have to use additional opened loop codes here, remove them. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d6c02bb8fcf8..2eb778174a9b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -408,18 +408,16 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, struct ino_entry *e, *tmp; tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); -retry: + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { e = tmp; - if (radix_tree_insert(&im->ino_root, ino, e)) { - spin_unlock(&im->ino_lock); - radix_tree_preload_end(); - goto retry; - } + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + memset(e, 0, sizeof(struct ino_entry)); e->ino = ino; From 48c72b4c8c5016521b3c670d003acb1dc664c3ac Mon Sep 17 00:00:00 2001 From: LiFan Date: Fri, 10 Nov 2017 15:41:42 +0800 Subject: [PATCH 501/804] f2fs: validate before set/clear free nat bitmap In flush_nat_entries, all dirty nats will be flushed and if their new address isn't NULL_ADDR, their bitmaps will be updated, the free_nid_count of the bitmaps will be increased regardless of whether the nats have already been occupied before. This could lead to wrong free_nid_count. So this patch checks the status of the bits before actually setting/clearing them.
Fixes: 586d1492f301 ("f2fs: skip scanning free nid bitmap of full NAT blocks") Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 62e597b08e09..7e3ee2c5e497 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1909,15 +1909,18 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) return; - if (set) + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - else - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - - if (set) nm_i->free_nid_count[nat_ofs]++; - else if (!build) - nm_i->free_nid_count[nat_ofs]--; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } } static void scan_nat_page(struct f2fs_sb_info *sbi, From c4cd2efe835b9b3dc0d9ca0807f46b603f6e3532 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 10 Nov 2017 13:36:51 -0800 Subject: [PATCH 502/804] f2fs: separate nat entry mem alloc from nat_tree_lock This patch splits memory allocation part in nat_entry to avoid lock contention. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 98 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7e3ee2c5e497..964c99655942 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -138,6 +138,44 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } +static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +{ + struct nat_entry *new; + + if (no_fail) + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + else + new = kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} + +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); +} + +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) +{ + if (no_fail) + f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; + + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); + list_add_tail(&ne->list, &nm_i->nat_entries); + nm_i->nat_cnt++; + return ne; +} + static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -154,7 +192,7 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) list_del(&e->list); radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; - kmem_cache_free(nat_entry_slab, e); + __free_nat_entry(e); } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -250,49 +288,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, 
- bool no_fail) -{ - struct nat_entry *new; - - if (no_fail) { - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); - } else { - new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - if (!new) - return NULL; - if (radix_tree_insert(&nm_i->nat_root, nid, new)) { - kmem_cache_free(nat_entry_slab, new); - return NULL; - } - } - - memset(new, 0, sizeof(struct nat_entry)); - nat_set_nid(new, nid); - nat_reset_flag(new); - list_add_tail(&new->list, &nm_i->nat_entries); - nm_i->nat_cnt++; - return new; -} - +/* must be locked by nat_tree_lock */ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct nat_entry *e; + struct nat_entry *new, *e; + new = __alloc_nat_entry(nid, false); + if (!new) + return; + + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (!e) { - e = grab_nat_entry(nm_i, nid, false); - if (e) - node_info_from_raw_nat(&e->ni, ne); - } else { + if (!e) + e = __init_nat_entry(nm_i, new, ne, false); + else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - } + up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -300,11 +318,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; + struct nat_entry *new = __alloc_nat_entry(ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid, true); + e = __init_nat_entry(nm_i, new, NULL, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -316,6 +335,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, 
copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); /* sanity check */ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); @@ -424,9 +446,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) f2fs_put_page(page, 1); cache: /* cache nat entry */ - down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - up_write(&nm_i->nat_tree_lock); } /* @@ -2377,8 +2397,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid, true); - node_info_from_raw_nat(&ne->ni, &raw_ne); + ne = __alloc_nat_entry(nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true); } /* From e6cfc5de2d057d457d53084acb52c9383e62f44a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 Nov 2017 17:46:38 -0800 Subject: [PATCH 503/804] f2fs: expose quota information in debugfs This patch shows # of dirty pages and # of hidden quota files. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 +++++++++++ fs/f2fs/f2fs.h | 10 ++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f7eec506ceea..ecada8425268 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,9 +45,18 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + + si->nquota_files = 0; + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + si->nquota_files++; + } + } si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); @@ -369,6 +378,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 31edffdd51d4..7569347fd453 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -929,6 +929,7 @@ struct f2fs_sm_info { enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -1703,6 +1704,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode) atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1719,6 +1722,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -2853,9 +2858,10 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; int inmem_pages; - unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; From 9262922510220084179c7fec823dd05526bdf77f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Nov 2017 17:32:39 +0800 Subject: [PATCH 504/804] f2fs: fix to clear FI_NO_PREALLOC We need to clear FI_NO_PREALLOC flag in error path of f2fs_file_write_iter, otherwise we will lose the chance to preallocate blocks in latter write() at one time. 
Fixes: dc91de78e5e1 ("f2fs: do not preallocate blocks which has wrong buffer") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3de13816d2ac..52d29785154a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2529,6 +2529,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) err = f2fs_preallocate_blocks(iocb, from); if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); inode_unlock(inode); return err; } From c394842e26e555a5d26d476e8fb2014ce7fbae57 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 13 Nov 2017 17:32:40 +0800 Subject: [PATCH 505/804] f2fs: inject fault in inc_valid_node_count This patch adds missing fault injection in inc_valid_node_count. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7569347fd453..fc9c00ae7159 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1831,6 +1831,13 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + goto enospc; + } +#endif + spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; From 56a07b07051015d5fab339561103c4cc346c9685 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Nov 2017 19:28:42 +0800 Subject: [PATCH 506/804] f2fs: deny accessing encryption policy if encryption is off This patch adds missing feature check in encryption ioctl interface. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 52d29785154a..bfff53f658e1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1905,6 +1905,9 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); + if (!f2fs_sb_has_crypto(inode->i_sb)) + return -EOPNOTSUPP; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); @@ -1912,6 +1915,8 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { + if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } From a53dc7e00559b3302b38deeffab735fdfe6ec20c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 22 Jun 2017 12:14:40 -0700 Subject: [PATCH 507/804] fscrypt: make ->dummy_context() return bool This makes it consistent with ->is_encrypted(), ->empty_dir(), and fscrypt_dummy_context_enabled(). 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 4022c61f7e9b..e3e1208e0f54 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -77,7 +77,7 @@ struct fscrypt_operations { const char *key_prefix; int (*get_context)(struct inode *, void *, size_t); int (*set_context)(struct inode *, const void *, size_t, void *); - int (*dummy_context)(struct inode *); + bool (*dummy_context)(struct inode *); bool (*is_encrypted)(struct inode *); bool (*empty_dir)(struct inode *); unsigned (*max_namelen)(struct inode *); From bc4a61c60bea8d5e7468885ef7b7a41ba16b1b96 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:46:18 -0700 Subject: [PATCH 508/804] fscrypt: fix dereference of NULL user_key_payload When an fscrypt-encrypted file is opened, we request the file's master key from the keyrings service as a logon key, then access its payload. However, a revoked key has a NULL payload, and we failed to check for this. request_key() *does* skip revoked keys, but there is still a window where the key can be revoked before we acquire its semaphore. Fix it by checking for a NULL payload, treating it like a key which was already revoked at the time it was requested. 
Fixes: 88bd6ccdcdd6 ("ext4 crypto: add encryption key management facilities") Reviewed-by: James Morris Cc: [v4.1+] Signed-off-by: Eric Biggers Signed-off-by: David Howells --- fs/crypto/keyinfo.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 66e0728e9bbe..169fefb62940 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -109,6 +109,11 @@ static int validate_user_key(struct fscrypt_info *crypt_info, goto out; } ukp = user_key_payload(keyring_key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + res = -EKEYREVOKED; + goto out; + } if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; goto out; From ff0a3dbc9392233e967b54534459c68d91e7963e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 9 Oct 2017 12:15:34 -0700 Subject: [PATCH 509/804] fscrypt: clean up include file mess Filesystems have to include different header files based on whether they are compiled with encryption support or not. That's nasty and messy. Instead, rationalise the headers so we have a single include fscrypt.h and let it decide what internal implementation to include based on the __FS_HAS_ENCRYPTION define. Filesystems set __FS_HAS_ENCRYPTION to 1 before including linux/fscrypt.h if they are built with encryption support. Otherwise, they must set __FS_HAS_ENCRYPTION to 0. Add guards to prevent fscrypt_supp.h and fscrypt_notsupp.h from being directly included by filesystems. 
Signed-off-by: Dave Chinner [EB: use 1 and 0 rather than defined/undefined] Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 3 +- fs/ext4/ext4.h | 3 ++ fs/f2fs/f2fs.h | 8 ++-- include/linux/{fscrypt_common.h => fscrypt.h} | 41 +++++++++++++------ include/linux/fscrypt_notsupp.h | 7 +++- include/linux/fscrypt_supp.h | 7 ++-- 6 files changed, 45 insertions(+), 24 deletions(-) rename include/linux/{fscrypt_common.h => fscrypt.h} (79%) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 79d79755d79b..ff97988fe6e9 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -11,7 +11,8 @@ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H -#include +#define __FS_HAS_ENCRYPTION 1 +#include #include /* Encryption parameters */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7e921d207fb..bea7d7febdab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -37,6 +37,9 @@ #include #endif +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) +#include + /* * The fourth extended filesystem constants/structures */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fc9c00ae7159..b036ea741a03 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -23,14 +23,12 @@ #include #include #include -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#include -#else -#include -#endif #include #include +#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) +#include + #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) #else diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt.h similarity index 79% rename from include/linux/fscrypt_common.h rename to include/linux/fscrypt.h index e3e1208e0f54..58663327f692 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt.h @@ -1,14 +1,17 @@ /* - * fscrypt_common.h: common declarations for per-file encryption + * fscrypt.h: declarations for per-file encryption + * + * Filesystems that implement 
per-file encryption include this header + * file with the __FS_HAS_ENCRYPTION set according to whether that filesystem + * is being built with encryption support or not. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ - -#ifndef _LINUX_FSCRYPT_COMMON_H -#define _LINUX_FSCRYPT_COMMON_H +#ifndef _LINUX_FSCRYPT_H +#define _LINUX_FSCRYPT_H #include #include @@ -116,23 +119,35 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) return false; } +#if __FS_HAS_ENCRYPTION + static inline struct page *fscrypt_control_page(struct page *page) { -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) return ((struct fscrypt_ctx *)page_private(page))->w.control_page; -#else +} + +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return (inode->i_crypt_info != NULL); +} + +#include + +#else /* !__FS_HAS_ENCRYPTION */ + +static inline struct page *fscrypt_control_page(struct page *page) +{ WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); -#endif } -static inline int fscrypt_has_encryption_key(const struct inode *inode) +static inline bool fscrypt_has_encryption_key(const struct inode *inode) { -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return (inode->i_crypt_info != NULL); -#else return 0; -#endif } -#endif /* _LINUX_FSCRYPT_COMMON_H */ +#include +#endif /* __FS_HAS_ENCRYPTION */ + + +#endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index ec406aed2f2f..2d0b6960831e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -3,13 +3,16 @@ * * This stubs out the fscrypt functions for filesystems configured without * encryption support. + * + * Do not include this file directly. Use fscrypt.h instead! */ +#ifndef _LINUX_FSCRYPT_H +#error "Incorrect include of linux/fscrypt_notsupp.h!" 
+#endif #ifndef _LINUX_FSCRYPT_NOTSUPP_H #define _LINUX_FSCRYPT_NOTSUPP_H -#include - /* crypto.c */ static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 32e2fcf13b01..5a90e5ef4687 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -1,14 +1,15 @@ /* * fscrypt_supp.h * - * This is included by filesystems configured with encryption support. + * Do not include this file directly. Use fscrypt.h instead! */ +#ifndef _LINUX_FSCRYPT_H +#error "Incorrect include of linux/fscrypt_supp.h!" +#endif #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H -#include - /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); From a4781dd1f17554560631993375a446c04c7d6c78 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:35 -0700 Subject: [PATCH 510/804] fs, fscrypt: add an S_ENCRYPTED inode flag Introduce a flag S_ENCRYPTED which can be set in ->i_flags to indicate that the inode is encrypted using the fscrypt (fs/crypto/) mechanism. Checking this flag will give the same information that inode->i_sb->s_cop->is_encrypted(inode) currently does, but will be more efficient. This will be useful for adding higher-level helper functions for filesystems to use. For example we'll be able to replace this: if (ext4_encrypted_inode(inode)) { ret = fscrypt_get_encryption_info(inode); if (ret) return ret; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } with this: ret = fscrypt_require_key(inode); if (ret) return ret; ... since we'll be able to retain the fast path for unencrypted files as a single flag check, using an inline function. This wasn't possible before because we'd have had to frequently call through the ->i_sb->s_cop->is_encrypted function pointer, even when the encryption support was disabled or not being used. 
Note: we don't define S_ENCRYPTED to 0 if CONFIG_FS_ENCRYPTION is disabled because we want to continue to return an error if an encrypted file is accessed without encryption support, rather than pretending that it is unencrypted. Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 5 ++++- include/linux/fs.h | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28702932a908..df30d04f6760 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4103,8 +4103,11 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; if (test_opt(inode->i_sb, DAX)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| + S_ENCRYPTED); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b036ea741a03..dc4a95e848af 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3147,6 +3147,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); + inode->i_flags |= S_ENCRYPTED; #endif } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9684d53563f1..b4c4f2b25304 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -43,8 +43,11 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (f2fs_encrypted_inode(inode)) + new_fl |= S_ENCRYPTED; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC| + S_ENCRYPTED); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/include/linux/fs.h b/include/linux/fs.h index a88271902ff2..933978eb92fb 100644 --- 
a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1778,6 +1778,7 @@ struct super_operations { #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif +#define S_ENCRYPTED 16384 /* Encrypted file (using fs/crypto/) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -1816,6 +1817,7 @@ struct super_operations { #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) +#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) From 32c0d3ae9d664766abf4e64f89398dcd92614b35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:36 -0700 Subject: [PATCH 511/804] fscrypt: switch from ->is_encrypted() to IS_ENCRYPTED() IS_ENCRYPTED() now gives the same information as i_sb->s_cop->is_encrypted() but is more efficient, since IS_ENCRYPTED() is just a simple flag check. Prepare to remove ->is_encrypted() by switching all callers to IS_ENCRYPTED(). 
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 2 +- fs/crypto/fname.c | 3 +-- fs/crypto/keyinfo.c | 2 +- fs/crypto/policy.c | 6 +++--- include/linux/fscrypt_notsupp.h | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index c7835df7e7b8..608f6bbe0f31 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -340,7 +340,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dir = dget_parent(dentry); - if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + if (!IS_ENCRYPTED(d_inode(dir))) { dput(dir); return 0; } diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index ad9f814fdead..2878289b3ed2 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -382,8 +382,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!dir->i_sb->s_cop->is_encrypted(dir) || - fscrypt_is_dot_dotdot(iname)) { + if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 169fefb62940..3ce6ca91ce23 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -273,7 +273,7 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || - inode->i_sb->s_cop->is_encrypted(inode)) + IS_ENCRYPTED(inode)) return res; /* Fake up a context for an unencrypted directory */ memset(&ctx, 0, sizeof(ctx)); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 9914d51dff86..2f2c53f2e136 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,7 +109,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int 
res; - if (!inode->i_sb->s_cop->is_encrypted(inode)) + if (!IS_ENCRYPTED(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -166,11 +166,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 1; /* No restrictions if the parent directory is unencrypted */ - if (!cops->is_encrypted(parent)) + if (!IS_ENCRYPTED(parent)) return 1; /* Encrypted directories must not contain unencrypted files */ - if (!cops->is_encrypted(child)) + if (!IS_ENCRYPTED(child)) return 0; /* diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 2d0b6960831e..7b390e356f7f 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -100,7 +100,7 @@ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - if (dir->i_sb->s_cop->is_encrypted(dir)) + if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(struct fscrypt_name)); From 1034eeec516a7ff036c55e46ff9fe124584fdb82 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:37 -0700 Subject: [PATCH 512/804] fscrypt: remove ->is_encrypted() Now that all callers of fscrypt_operations.is_encrypted() have been switched to IS_ENCRYPTED(), remove ->is_encrypted(). 
Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/super.c | 2 -- include/linux/fscrypt.h | 1 - 2 files changed, 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 76e2f1518224..07a5628a6779 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1742,13 +1742,11 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else static const struct fscrypt_operations f2fs_cryptops = { - .is_encrypted = f2fs_encrypted_inode, }; #endif diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 58663327f692..800e0f812f36 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -81,7 +81,6 @@ struct fscrypt_operations { int (*get_context)(struct inode *, void *, size_t); int (*set_context)(struct inode *, const void *, size_t, void *); bool (*dummy_context)(struct inode *); - bool (*is_encrypted)(struct inode *); bool (*empty_dir)(struct inode *); unsigned (*max_namelen)(struct inode *); }; From 272e43502577d08921becbce635d8e0a48c8086d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:38 -0700 Subject: [PATCH 513/804] fscrypt: remove unneeded empty fscrypt_operations structs In the case where a filesystem has been configured without encryption support, there is no longer any need to initialize ->s_cop at all, since none of the methods are ever called. 
Reviewed-by: Chao Yu Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07a5628a6779..187cead7bd37 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1745,9 +1745,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; -#else -static const struct fscrypt_operations f2fs_cryptops = { -}; #endif static struct inode *f2fs_nfs_get_inode(struct super_block *sb, @@ -2476,7 +2473,9 @@ try_onemore: #endif sb->s_op = &f2fs_sops; +#ifdef CONFIG_F2FS_FS_ENCRYPTION sb->s_cop = &f2fs_cryptops; +#endif sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; From 8c815f381cd6224828e63dbfb5435bdd58240ed4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:39 -0700 Subject: [PATCH 514/804] fscrypt: new helper function - fscrypt_require_key() Add a helper function which checks if an inode is encrypted, and if so, tries to set up its encryption key. This is a pattern which is duplicated in multiple places in each of ext4, f2fs, and ubifs --- for example, when a regular file is asked to be opened or truncated. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 800e0f812f36..b1e3914c3e49 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -148,5 +148,30 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) #include #endif /* __FS_HAS_ENCRYPTION */ +/** + * fscrypt_require_key - require an inode's encryption key + * @inode: the inode we need the key for + * + * If the inode is encrypted, set up its encryption key if not already done. 
+ * Then require that the key be present and return -ENOKEY otherwise. + * + * No locks are needed, and the key will live as long as the struct inode --- so + * it won't go away from under you. + * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int fscrypt_require_key(struct inode *inode) +{ + if (IS_ENCRYPTED(inode)) { + int err = fscrypt_get_encryption_info(inode); + + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + return 0; +} #endif /* _LINUX_FSCRYPT_H */ From 2b4b4f98dddf0430cb52d9729a51066fe16153b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:40 -0700 Subject: [PATCH 515/804] fscrypt: new helper function - fscrypt_file_open() Add a helper function which prepares to open a regular file which may be encrypted. It handles setting up the file's encryption key, then checking that the file's encryption policy matches that of its parent directory (if the parent directory is encrypted). It may be set as the ->open() method or it can be called from another ->open() method. 
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/Makefile | 2 +- fs/crypto/hooks.c | 49 +++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 9 ++++++ include/linux/fscrypt_supp.h | 3 ++ 4 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 fs/crypto/hooks.c diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index 9f6607f17b53..cb496989a6b6 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o -fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c new file mode 100644 index 000000000000..069088e91ea9 --- /dev/null +++ b/fs/crypto/hooks.c @@ -0,0 +1,49 @@ +/* + * fs/crypto/hooks.c + * + * Encryption hooks for higher-level filesystem operations. + */ + +#include +#include "fscrypt_private.h" + +/** + * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * @inode: the inode being opened + * @filp: the struct file being set up + * + * Currently, an encrypted regular file can only be opened if its encryption key + * is available; access to the raw encrypted contents is not supported. + * Therefore, we first set up the inode's encryption key (if not already done) + * and return an error if it's unavailable. + * + * We also verify that if the parent directory (from the path via which the file + * is being opened) is encrypted, then the inode being opened uses the same + * encryption policy. This is needed as part of the enforcement that all files + * in an encrypted directory tree use the same encryption policy, as a + * protection against certain types of offline attacks. Note that this check is + * needed even when opening an *unencrypted* file, since it's forbidden to have + * an unencrypted file in an encrypted directory. 
+ * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + */ +int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + int err; + struct dentry *dir; + + err = fscrypt_require_key(inode); + if (err) + return err; + + dir = dget_parent(file_dentry(filp)); + if (IS_ENCRYPTED(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); + err = -EPERM; + } + dput(dir); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_file_open); diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 7b390e356f7f..162da6517ac4 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -177,4 +177,13 @@ static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return -EOPNOTSUPP; } +/* hooks.c */ + +static inline int fscrypt_file_open(struct inode *inode, struct file *filp) +{ + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + return 0; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 5a90e5ef4687..fd2f6decaee4 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -143,4 +143,7 @@ extern void fscrypt_pullback_bio_page(struct page **, bool); extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, unsigned int); +/* hooks.c */ +extern int fscrypt_file_open(struct inode *inode, struct file *filp); + #endif /* _LINUX_FSCRYPT_SUPP_H */ From 95efafb6239dd82ca0bb3d9e32edaa41da58f54e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:41 -0700 Subject: [PATCH 516/804] fscrypt: new helper function - fscrypt_prepare_link() Introduce a helper function which prepares to link an inode into a possibly-encrypted directory. 
It handles setting up the target directory's encryption key, then verifying that the link won't violate the constraint that all files in an encrypted directory tree use the same encryption policy. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 15 +++++++++++++++ include/linux/fscrypt.h | 27 +++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 6 ++++++ include/linux/fscrypt_supp.h | 1 + 4 files changed, 49 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 069088e91ea9..8b90217320dd 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -47,3 +47,18 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); + +int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) +{ + int err; + + err = fscrypt_require_key(dir); + if (err) + return err; + + if (!fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b1e3914c3e49..4a2b0e307711 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -174,4 +174,31 @@ static inline int fscrypt_require_key(struct inode *inode) return 0; } +/** + * fscrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory + * @old_dentry: an existing dentry for the inode being linked + * @dir: the target directory + * @dentry: negative dentry for the target filename + * + * A new link can only be added to an encrypted directory if the directory's + * encryption key is available --- since otherwise we'd have no way to encrypt + * the filename. Therefore, we first set up the directory's encryption key (if + * not already done) and return an error if it's unavailable. + * + * We also verify that the link will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. 
+ * + * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, + * -EPERM if the link would result in an inconsistent encryption policy, or + * another -errno code. + */ +static inline int fscrypt_prepare_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + if (IS_ENCRYPTED(dir)) + return __fscrypt_prepare_link(d_inode(old_dentry), dir); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 162da6517ac4..d7d1039eb6b5 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -186,4 +186,10 @@ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) return 0; } +static inline int __fscrypt_prepare_link(struct inode *inode, + struct inode *dir) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index fd2f6decaee4..80706283da75 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -145,5 +145,6 @@ extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); +extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir); #endif /* _LINUX_FSCRYPT_SUPP_H */ From a31feba5c18ff73cedb4301c5a0f2ffa7624218b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:42 -0700 Subject: [PATCH 517/804] fscrypt: new helper function - fscrypt_prepare_rename() Introduce a helper function which prepares to rename a file into a possibly encrypted directory. It handles loading the encryption keys for the source and target directories if needed, and it handles enforcing that if the target directory (and the source directory for a cross-rename) is encrypted, then the file being moved into the directory has the same encryption policy as its containing directory. 
Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 30 ++++++++++++++++++++++++++++++ include/linux/fscrypt.h | 33 +++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 9 +++++++++ include/linux/fscrypt_supp.h | 5 +++++ 4 files changed, 77 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 8b90217320dd..822cb78f9b45 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -62,3 +62,33 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir) return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); + +int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + int err; + + err = fscrypt_require_key(old_dir); + if (err) + return err; + + err = fscrypt_require_key(new_dir); + if (err) + return err; + + if (old_dir != new_dir) { + if (IS_ENCRYPTED(new_dir) && + !fscrypt_has_permitted_context(new_dir, + d_inode(old_dentry))) + return -EPERM; + + if ((flags & RENAME_EXCHANGE) && + IS_ENCRYPTED(old_dir) && + !fscrypt_has_permitted_context(old_dir, + d_inode(new_dentry))) + return -EPERM; + } + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 4a2b0e307711..d331050e93f4 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -201,4 +201,37 @@ static inline int fscrypt_prepare_link(struct dentry *old_dentry, return 0; } +/** + * fscrypt_prepare_rename - prepare for a rename between possibly-encrypted directories + * @old_dir: source directory + * @old_dentry: dentry for source file + * @new_dir: target directory + * @new_dentry: dentry for target location (may be negative unless exchanging) + * @flags: rename flags (we care at least about %RENAME_EXCHANGE) + * + * Prepare for ->rename() where the source and/or target directories may be + * encrypted. 
A new link can only be added to an encrypted directory if the + * directory's encryption key is available --- since otherwise we'd have no way + * to encrypt the filename. A rename to an existing name, on the other hand, + * *is* cryptographically possible without the key. However, we take the more + * conservative approach and just forbid all no-key renames. + * + * We also verify that the rename will not violate the constraint that all files + * in an encrypted directory tree use the same encryption policy. + * + * Return: 0 on success, -ENOKEY if an encryption key is missing, -EPERM if the + * rename would cause inconsistent encryption policies, or another -errno code. + */ +static inline int fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) + return __fscrypt_prepare_rename(old_dir, old_dentry, + new_dir, new_dentry, flags); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index d7d1039eb6b5..6af378d8126e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -192,4 +192,13 @@ static inline int __fscrypt_prepare_link(struct inode *inode, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 80706283da75..40f35073145f 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -146,5 +146,10 @@ extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); extern int __fscrypt_prepare_link(struct inode 
*inode, struct inode *dir); +extern int __fscrypt_prepare_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry, + unsigned int flags); #endif /* _LINUX_FSCRYPT_SUPP_H */ From 5cbdd42ad248df655c3a07c0be92078e6e4ebe84 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:43 -0700 Subject: [PATCH 518/804] fscrypt: new helper function - fscrypt_prepare_lookup() Introduce a helper function which prepares to look up the given dentry in the given directory. If the directory is encrypted, it handles loading the directory's encryption key, setting the dentry's ->d_op to fscrypt_d_ops, and setting DCACHE_ENCRYPTED_WITH_KEY if the directory's encryption key is available. Note: once all filesystems switch over to this, we'll be able to move fscrypt_d_ops and fscrypt_set_encrypted_dentry() to fscrypt_private.h. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 18 ++++++++++++++++++ include/linux/fscrypt.h | 28 ++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 6 ++++++ include/linux/fscrypt_supp.h | 1 + 4 files changed, 53 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 822cb78f9b45..9f5fb2eb9cf7 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -92,3 +92,21 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); + +int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) +{ + int err = fscrypt_get_encryption_info(dir); + + if (err) + return err; + + if (fscrypt_has_encryption_key(dir)) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + } + + d_set_d_op(dentry, &fscrypt_d_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index d331050e93f4..9f1050721ab1 100644 --- 
a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -234,4 +234,32 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, return 0; } +/** + * fscrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory + * @dir: directory being searched + * @dentry: filename being looked up + * @flags: lookup flags + * + * Prepare for ->lookup() in a directory which may be encrypted. Lookups can be + * done with or without the directory's encryption key; without the key, + * filenames are presented in encrypted form. Therefore, we'll try to set up + * the directory's encryption key, but even without it the lookup can continue. + * + * To allow invalidating stale dentries if the directory's encryption key is + * added later, we also install a custom ->d_revalidate() method and use the + * DCACHE_ENCRYPTED_WITH_KEY flag to indicate whether a given dentry is a + * plaintext name (flag set) or a ciphertext name (flag cleared). + * + * Return: 0 on success, -errno if a problem occurred while setting up the + * encryption key + */ +static inline int fscrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + if (IS_ENCRYPTED(dir)) + return __fscrypt_prepare_lookup(dir, dentry); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 6af378d8126e..c4c6bf2c390e 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -201,4 +201,10 @@ static inline int __fscrypt_prepare_rename(struct inode *old_dir, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_lookup(struct inode *dir, + struct dentry *dentry) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 40f35073145f..2db5e9706f60 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -151,5 +151,6 @@ extern int 
__fscrypt_prepare_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); +extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry); #endif /* _LINUX_FSCRYPT_SUPP_H */ From 2286508d17c258719b7f1e37f8000cb4faebf51b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:15:44 -0700 Subject: [PATCH 519/804] fscrypt: new helper function - fscrypt_prepare_setattr() Introduce a helper function for filesystems to call when processing ->setattr() on a possibly-encrypted inode. It handles enforcing that an encrypted file can only be truncated if its encryption key is available. Acked-by: Dave Chinner Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 9f1050721ab1..8641e56b8f8a 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -262,4 +262,29 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, return 0; } +/** + * fscrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes + * @dentry: dentry through which the inode is being changed + * @attr: attributes to change + * + * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, + * most attribute changes are allowed even without the encryption key. However, + * without the encryption key we do have to forbid truncates. This is needed + * because the size being truncated to may not be a multiple of the filesystem + * block size, and in that case we'd have to decrypt the final block, zero the + * portion past i_size, and re-encrypt it. (We *could* allow truncating to a + * filesystem block boundary, but it's simpler to just forbid all truncates --- + * and we already forbid all other contents modifications without the key.) 
+ * + * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code + * if a problem occurred while setting up the encryption key. + */ +static inline int fscrypt_prepare_setattr(struct dentry *dentry, + struct iattr *attr) +{ + if (attr->ia_valid & ATTR_SIZE) + return fscrypt_require_key(d_inode(dentry)); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ From 42d89da82b25f2d2e6bc062c1181d1fdd3926446 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 29 Oct 2017 06:30:19 -0400 Subject: [PATCH 520/804] fscrypt: lock mutex before checking for bounce page pool fscrypt_initialize(), which allocates the global bounce page pool when an encrypted file is first accessed, uses "double-checked locking" to try to avoid locking fscrypt_init_mutex. However, it doesn't use any memory barriers, so it's theoretically possible for a thread to observe a bounce page pool which has not been fully initialized. This is a classic bug with "double-checked locking". While "only a theoretical issue" in the latest kernel, in pre-4.8 kernels the pointer that was checked was not even the last to be initialized, so it was easily possible for a crash (NULL pointer dereference) to happen. This was changed only incidentally by the large refactor to use fs/crypto/. Solve both problems in a trivial way that can easily be backported: just always take the mutex. It's theoretically less efficient, but it shouldn't be noticeable in practice as the mutex is only acquired very briefly once per encrypted file. Later I'd like to make this use a helper macro like DO_ONCE(). However, DO_ONCE() runs in atomic context, so we'd need to add a new macro that allows blocking. 
Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 608f6bbe0f31..472326737717 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -410,11 +410,8 @@ int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - /* - * No need to allocate a bounce page pool if there already is one or - * this FS won't use it. - */ - if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) + /* No need to allocate a bounce page pool if this FS won't use it. */ + if (cop_flags & FS_CFLG_OWN_PAGES) return 0; mutex_lock(&fscrypt_init_mutex); From 4ecacbed6e1ca727c1df55b231b3a7247ac38c4f Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 18 Oct 2017 08:00:38 +0100 Subject: [PATCH 521/804] crypto: introduce crypto wait for async op Invoking a possibly async. crypto op and waiting for completion while correctly handling backlog processing is a common task in the crypto API implementation and outside users of it. This patch adds a generic implementation for doing so in preparation for using it across the board instead of hand rolled versions. 
Signed-off-by: Gilad Ben-Yossef CC: Eric Biggers CC: Jonathan Cameron Signed-off-by: Herbert Xu --- crypto/api.c | 13 +++++++++++++ include/linux/crypto.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/crypto/api.c b/crypto/api.c index bbc147cb5dec..e5c1abfd451f 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" LIST_HEAD(crypto_alg_list); @@ -611,5 +612,17 @@ int crypto_has_alg(const char *name, u32 type, u32 mask) } EXPORT_SYMBOL_GPL(crypto_has_alg); +void crypto_req_done(struct crypto_async_request *req, int err) +{ + struct crypto_wait *wait = req->data; + + if (err == -EINPROGRESS) + return; + + wait->err = err; + complete(&wait->completion); +} +EXPORT_SYMBOL_GPL(crypto_req_done); + MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL"); diff --git a/include/linux/crypto.h b/include/linux/crypto.h index e71cb70a1ac2..b7c1e1a7ebac 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -24,6 +24,7 @@ #include #include #include +#include /* * Autoloaded crypto modules should only use a prefixed name to avoid allowing @@ -469,6 +470,45 @@ struct crypto_alg { struct module *cra_module; } CRYPTO_MINALIGN_ATTR; +/* + * A helper struct for waiting for completion of async crypto ops + */ +struct crypto_wait { + struct completion completion; + int err; +}; + +/* + * Macro for declaring a crypto op async wait object on stack + */ +#define DECLARE_CRYPTO_WAIT(_wait) \ + struct crypto_wait _wait = { \ + COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 } + +/* + * Async ops completion helper functions + */ +void crypto_req_done(struct crypto_async_request *req, int err); + +static inline int crypto_wait_req(int err, struct crypto_wait *wait) +{ + switch (err) { + case -EINPROGRESS: + case -EBUSY: + wait_for_completion(&wait->completion); + reinit_completion(&wait->completion); + err = wait->err; + break; + }; + + return err;
+} + +static inline void crypto_init_wait(struct crypto_wait *wait) +{ + init_completion(&wait->completion); +} + /* * Algorithm registration interface. */ From 9e32f17d241bf2aceef65a33c133d09890fa20d4 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 18 Oct 2017 08:00:44 +0100 Subject: [PATCH 522/804] fscrypt: move to generic async completion fscrypt starts several async crypto ops and waits for them to complete. Move it over to generic code doing the same. Signed-off-by: Gilad Ben-Yossef Signed-off-by: Herbert Xu --- fs/crypto/crypto.c | 28 ++++------------------------ fs/crypto/fname.c | 36 ++++++------------------------------ fs/crypto/fscrypt_private.h | 9 --------- fs/crypto/keyinfo.c | 21 +++------------------ 4 files changed, 13 insertions(+), 81 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 472326737717..732a786cce9d 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -126,21 +126,6 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) } EXPORT_SYMBOL(fscrypt_get_ctx); -/** - * page_crypt_complete() - completion callback for page crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void page_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, unsigned int len, @@ -151,7 +136,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u8 padding[FS_IV_SIZE - sizeof(__le64)]; } iv; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -179,7
+164,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - page_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, len, offs); @@ -187,14 +172,9 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, sg_set_page(&src, src_page, len, offs); skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) - res = crypto_skcipher_decrypt(req); + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); else - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { printk_ratelimited(KERN_ERR diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 2878289b3ed2..6eb434363ff2 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,21 +14,6 @@ #include #include "fscrypt_private.h" -/** - * fname_crypt_complete() - completion callback for filename crypto - * @req: The asynchronous cipher request context - * @res: The result of the cipher operation - */ -static void fname_crypt_complete(struct crypto_async_request *req, int res) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - /** * fname_encrypt() - encrypt a filename * @@ -40,7 +25,7 @@ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; @@ -76,17 +61,12 @@ static int fname_encrypt(struct inode *inode, } skcipher_request_set_callback(req, 
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); sg_init_one(&sg, oname->name, cryptlen); skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); /* Do the encryption */ - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - /* Request is being completed asynchronously; wait for it */ - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR @@ -110,7 +90,7 @@ static int fname_decrypt(struct inode *inode, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; @@ -131,7 +111,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fname_crypt_complete, &ecr); + crypto_req_done, &wait); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -140,11 +120,7 @@ static int fname_decrypt(struct inode *inode, sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_skcipher_decrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ff97988fe6e9..c3ad415cd14f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -70,15 +70,6 @@ typedef enum { #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 #define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 -struct 
fscrypt_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_FS_COMPLETION_RESULT(ecr) \ - struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } - /* bio stuffs */ #define REQ_OP_READ READ #define REQ_OP_WRITE WRITE diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 3ce6ca91ce23..444c65ed6db8 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -17,17 +17,6 @@ static struct crypto_shash *essiv_hash_tfm; -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct fscrypt_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - /** * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. @@ -42,7 +31,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], { int res = 0; struct skcipher_request *req = NULL; - DECLARE_FS_COMPLETION_RESULT(ecr); + DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); @@ -59,7 +48,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); + crypto_req_done, &wait); res = crypto_skcipher_setkey(tfm, deriving_key, FS_AES_128_ECB_KEY_SIZE); if (res < 0) @@ -69,11 +58,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], sg_init_one(&dst_sg, derived_raw_key, source_key->size); skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, NULL); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } + res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: skcipher_request_free(req); crypto_free_skcipher(tfm); From 
ba1ade71012d50c8c9bedfc6ed6c009a7f4de59e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 9 Jan 2018 16:52:25 -0800 Subject: [PATCH 523/804] fscrypt: resolve some cherry-pick bugs - remove wrong linux/fscrypt.h declared in ext4 - remove obsolete function Fixes: 734f0d241d2b ("fscrypt: clean up include file mess") Signed-off-by: Jaegeuk Kim --- fs/ext4/ext4.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bea7d7febdab..b7e921d207fb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -37,9 +37,6 @@ #include #endif -#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION) -#include - /* * The fourth extended filesystem constants/structures */ From 4dd2d0733809951ac9ac4acdeefce30519019261 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Nov 2017 16:59:14 +0800 Subject: [PATCH 524/804] f2fs: reserve nid resource for quota sysfile During mkfs, quota sysfiles have already occupied nid resource, it needs to adjust remaining available nid count in kernel side. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 9 +-------- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/node.c | 2 +- fs/f2fs/super.c | 10 +++++++++- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ecada8425268..4d929627e210 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -49,14 +49,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; - - si->nquota_files = 0; - if (f2fs_sb_has_quota_ino(sbi->sb)) { - for (i = 0; i < MAXQUOTAS; i++) { - if (f2fs_qf_ino(sbi->sb, i)) - si->nquota_files++; - } - } + si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dc4a95e848af..8c03659b13cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1165,6 +1165,8 @@ struct f2fs_sb_info { block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + unsigned int nquota_files; /* # of quota sysfile */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 964c99655942..dca69888d6d3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2683,7 +2683,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - - F2FS_RESERVED_NODE_NUM; + sbi->nquota_files - F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 187cead7bd37..037d22233886 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1009,7 +1009,8 @@ static 
int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + avail_node_count = sbi->total_node_count - sbi->nquota_files - + F2FS_RESERVED_NODE_NUM; if (avail_node_count > user_block_count) { buf->f_files = user_block_count; @@ -2470,6 +2471,13 @@ try_onemore: else sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; + + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + sbi->nquota_files++; + } + } #endif sb->s_op = &f2fs_sops; From 2d69561135f2cd0e044657f52a031cea2d0d2652 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 17 Nov 2017 16:13:38 +0800 Subject: [PATCH 525/804] f2fs: no need to read nat block if nat_block_bitmap is set No need to read nat block if nat_block_bitmap is set. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dca69888d6d3..81972b156ebe 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1952,9 +1952,6 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; - if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) - return; - __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); i = start_nid % NAT_ENTRY_PER_BLOCK; @@ -2059,10 +2056,13 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) down_read(&nm_i->nat_tree_lock); while (1) { - struct page *page = get_current_nat_page(sbi, nid); + if (!test_bit_le(NAT_BLOCK_OFFSET(nid), + nm_i->nat_block_bitmap)) { + struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); + scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + } nid += 
(NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); if (unlikely(nid >= nm_i->max_nid)) From e1398f6554b462729062d986b100e022b290e7a4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 21 Nov 2017 17:49:54 +0800 Subject: [PATCH 526/804] f2fs: remove unneeded memory footprint accounting We forgot to remove memory footprint accounting of per-cpu type variables, fix it. Fixes: 35782b233f37 ("f2fs: remove percpu_count due to performance regression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 4d929627e210..674f9bbe98d9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -179,7 +179,6 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); - si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); From 55e2f89181ceff5dcebd87c0138d64dba8c3644d Mon Sep 17 00:00:00 2001 From: LiFan Date: Wed, 22 Nov 2017 16:07:23 +0800 Subject: [PATCH 527/804] f2fs: fix concurrent problem for updating free bitmap alloc_nid_failed and scan_nat_page can be called at the same time, and we haven't protected add_free_nid and update_free_nid_bitmap with the same nid_list_lock. That could lead to Thread A Thread B - __build_free_nids - scan_nat_page - add_free_nid - alloc_nid_failed - update_free_nid_bitmap - update_free_nid_bitmap scan_nat_page will clear the free bitmap since the nid is PREALLOC_NID, but alloc_nid_failed needs to set the free bitmap. This results in free nid with free bitmap cleared. This patch updates the bitmap under the same nid_list_lock in add_free_nid. It also uses __GFP_NOFAIL to make sure to update status of free nid correctly.
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 85 +++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 81972b156ebe..563c08c4aa7a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1834,8 +1834,33 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + nm_i->free_nid_count[nat_ofs]++; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } +} + /* return if the nid is recognized as free */ -static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +static bool add_free_nid(struct f2fs_sb_info *sbi, + nid_t nid, bool build, bool update) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *e; @@ -1851,8 +1876,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i->nid = nid; i->state = FREE_NID; - if (radix_tree_preload(GFP_NOFS)) - goto err; + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&nm_i->nid_list_lock); @@ -1893,9 +1917,14 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) ret = true; err = __insert_free_nid(sbi, i, FREE_NID); err_out: + if (update) { + update_free_nid_bitmap(sbi, nid, ret, build); + if (!build) + nm_i->available_nids++; + } spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); -err: + if (err) 
kmem_cache_free(free_nid_slab, i); return ret; @@ -1919,30 +1948,6 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); - unsigned int nid_ofs = nid - START_NID(nid); - - if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) - return; - - if (set) { - if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) - return; - __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - nm_i->free_nid_count[nat_ofs]++; - } else { - if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) - return; - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!build) - nm_i->free_nid_count[nat_ofs]--; - } -} - static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { @@ -1957,18 +1962,18 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - bool freed = false; - if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) - freed = add_free_nid(sbi, start_nid, true); - spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, start_nid, freed, true); - spin_unlock(&NM_I(sbi)->nid_list_lock); + if (blk_addr == NULL_ADDR) { + add_free_nid(sbi, start_nid, true, true); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, false, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } } @@ -1986,7 +1991,7 @@ static void scan_curseg_cache(struct f2fs_sb_info *sbi) addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); + add_free_nid(sbi, nid, true, false); 
else remove_free_nid(sbi, nid); } @@ -2013,7 +2018,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) break; nid = i * NAT_ENTRY_PER_BLOCK + idx; - add_free_nid(sbi, nid, true); + add_free_nid(sbi, nid, true, false); if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; @@ -2519,11 +2524,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { - add_free_nid(sbi, nid, false); - spin_lock(&NM_I(sbi)->nid_list_lock); - NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false); - spin_unlock(&NM_I(sbi)->nid_list_lock); + add_free_nid(sbi, nid, false, true); } else { spin_lock(&NM_I(sbi)->nid_list_lock); update_free_nid_bitmap(sbi, nid, false, false); From 47ee9b259811529a4653910f39648dbecd6bf89c Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 22 Nov 2017 18:23:38 +0800 Subject: [PATCH 528/804] f2fs: introduce sysfs readdir_ra to readahead inode block in readdir This patch introduces a sysfs interface readdir_ra to enable/disable readaheading inode block in f2fs_readdir. When readdir_ra is enabled, it improves the performance of "readdir + stat". 
For 300,000 files: time find /data/test > /dev/null disable readdir_ra: 1m25.69s real 0m01.94s user 0m50.80s system enable readdir_ra: 0m18.55s real 0m00.44s user 0m15.39s system Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/dir.c | 4 ++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 2 ++ 4 files changed, 13 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2baed1151eac..db7aab1516de 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -186,3 +186,9 @@ Date: August 2017 Contact: "Jaegeuk Kim" Description: Controls sleep time of GC urgent mode + +What: /sys/fs/f2fs//readdir_ra +Date: November 2017 +Contact: "Sheng Yong" +Description: + Controls readahead inode block in readdir. diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1955707b138b..55fb45b66ed2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -798,6 +798,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -836,6 +837,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, le32_to_cpu(de->ino), d_type)) return 1; + if (sbi->readdir_ra == 1) + ra_node_page(sbi, le32_to_cpu(de->ino)); + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c03659b13cd..e146ad84e09a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1157,6 +1157,7 @@ struct f2fs_sb_info { int dir_level; /* directory level */ int inline_xattr_size; /* inline xattr size */ unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ + int readdir_ra; /* readahead inode in readdir */ block_t 
user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 9835348b6e5d..93c3364250dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -299,6 +299,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -346,6 +347,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(iostat_enable), + ATTR_LIST(readdir_ra), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), From f130dbb98a68aeab036a82f25589d59fb9a4721d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 22 Nov 2017 18:23:39 +0800 Subject: [PATCH 529/804] f2fs: still write data if preallocate only partial blocks If there is not enough space left, f2fs_preallocate_blocks may only preallocate partial blocks. As a result, the write operation fails but i_blocks is not 0. To avoid this, f2fs should write data in a non-preallocation way and write as much data as the size of i_blocks.
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 043394aa6c62..58fec1c9d460 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -860,8 +860,14 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (err) return err; } - if (!f2fs_has_inline_data(inode)) - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (!f2fs_has_inline_data(inode)) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (map.m_len > 0 && err == -ENOSPC) { + set_inode_flag(inode, FI_NO_PREALLOC); + err = 0; + } + return err; + } return err; } From e5c7c86010305630cfe34130d113a770d169ab4f Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 22 Nov 2017 18:23:40 +0800 Subject: [PATCH 530/804] f2fs: remove unused parameter Commit d260081ccf37 ("f2fs: change recovery policy of xattr node block") removes the use of blkaddr, which is no longer used. So remove the parameter. 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/node.c | 2 +- fs/f2fs/recovery.c | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e146ad84e09a..a785fd3453bb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2689,8 +2689,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void recover_inline_xattr(struct inode *inode, struct page *page); -int recover_xattr_data(struct inode *inode, struct page *page, - block_t blkaddr); +int recover_xattr_data(struct inode *inode, struct page *page); int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 563c08c4aa7a..ef7330e939f3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2246,7 +2246,7 @@ update_inode: f2fs_put_page(ipage, 1); } -int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 92c57ace1939..7d63faf51e52 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -404,7 +404,7 @@ truncate_out: } static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, block_t blkaddr) + struct page *page) { struct dnode_of_data dn; struct node_info ni; @@ -415,7 +415,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - err = recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, 
page); if (!err) recovered++; goto out; @@ -568,7 +568,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, break; } } - err = do_recover_data(sbi, entry->inode, page, blkaddr); + err = do_recover_data(sbi, entry->inode, page); if (err) { f2fs_put_page(page, 1); break; From e1f9be2f7c82b8b0ac1340fe9ddc3d25fea24b71 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 23 Nov 2017 23:26:52 +0800 Subject: [PATCH 531/804] f2fs: fix lock dependency in between dio_rwsem & i_mmap_sem test/generic/208 reports a potential deadlock as below: Chain exists of: &mm->mmap_sem --> &fi->i_mmap_sem --> &fi->dio_rwsem[WRITE] Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fi->dio_rwsem[WRITE]); lock(&fi->i_mmap_sem); lock(&fi->dio_rwsem[WRITE]); lock(&mm->mmap_sem); This patch changes the lock dependency as below in fallocate() to fix this issue: - dio_rwsem - i_mmap_sem Fixes: bb06664a534b ("f2fs: avoid race in between GC and block exchange") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bfff53f658e1..e2990f67a4ee 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1170,14 +1170,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + goto out_unlock; truncate_pagecache(inode, offset); @@ -1196,9 +1196,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, 
new_size); out_unlock: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); -out: up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); return ret; } @@ -1369,6 +1368,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1379,9 +1381,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); - truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1409,10 +1408,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); - - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); return ret; } From b01e03d724dec89a9366526e7e023212ebf9ccec Mon Sep 17 00:00:00 2001 From: LiFan Date: Sat, 25 Nov 2017 11:46:18 +0800 Subject: [PATCH 532/804] f2fs: remove an excess variable Remove the variable page_idx which no one would miss. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 58fec1c9d460..d2558828915d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1202,7 +1202,6 @@ static int f2fs_mpage_readpages(struct address_space *mapping, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; @@ -1219,8 +1218,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_flags = 0; map.m_next_pgofs = NULL; - for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - + for (; nr_pages; nr_pages--) { if (pages) { page = list_last_entry(pages, struct page, lru); From 6d025237a1f8f205c08efdb80ba991bf41df98b1 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Sun, 26 Nov 2017 02:34:28 +0800 Subject: [PATCH 533/804] f2fs: remove repeated f2fs_bug_on f2fs: remove repeated f2fs_bug_on which has already existed in function invalidate_blocks. 
Signed-off-by: Zhikang Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ef7330e939f3..dda40f5c4c9b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -702,7 +702,6 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); @@ -2260,7 +2259,6 @@ int recover_xattr_data(struct inode *inode, struct page *page) /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); - f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); From 101c6a96ad1c69011659bb1ddc9bad2b534aac17 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:28 -0800 Subject: [PATCH 534/804] f2fs: switch to fscrypt_file_open() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e2990f67a4ee..81f298ac4cb2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -474,22 +474,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - struct dentry *dir; + int err = fscrypt_file_open(inode, filp); - if (f2fs_encrypted_inode(inode)) { - int ret = fscrypt_get_encryption_info(inode); - if (ret) - return -EACCES; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - dir = dget_parent(file_dentry(filp)); - if (f2fs_encrypted_inode(d_inode(dir)) && - !fscrypt_has_permitted_context(d_inode(dir), inode)) { - dput(dir); - return -EPERM; - } - dput(dir); + if (err) + return err; return dquot_file_open(inode, filp); } From 
aeaac517a12d29fdb94edec69d1e6140f775e483 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:29 -0800 Subject: [PATCH 535/804] f2fs: switch to fscrypt_prepare_link() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index cf8f4370d256..c93df5a1b305 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -240,9 +240,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir) && - !fscrypt_has_permitted_context(dir, inode)) - return -EPERM; + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(dir)->i_projid, From 9ab470eaf8a8fb0bee0eb781f151ddbe677385b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:30 -0800 Subject: [PATCH 536/804] f2fs: switch to fscrypt_prepare_rename() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index c93df5a1b305..e7fd30e45f47 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -797,18 +797,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if ((f2fs_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (f2fs_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !fscrypt_has_permitted_context(new_dir, old_inode)) { - err = -EPERM; - goto out; - } - if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, F2FS_I(old_dentry->d_inode)->i_projid))) @@ -999,18 +987,6 @@ 
static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if ((f2fs_encrypted_inode(old_dir) && - !fscrypt_has_encryption_key(old_dir)) || - (f2fs_encrypted_inode(new_dir) && - !fscrypt_has_encryption_key(new_dir))) - return -ENOKEY; - - if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!fscrypt_has_permitted_context(new_dir, old_inode) || - !fscrypt_has_permitted_context(old_dir, new_inode))) - return -EPERM; - if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, F2FS_I(old_dentry->d_inode)->i_projid)) || @@ -1150,9 +1126,16 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int err; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + if (flags & RENAME_EXCHANGE) { return f2fs_cross_rename(old_dir, old_dentry, new_dir, new_dentry); From bb8b850365ffd071b14def46b80eaa07bded0e13 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:31 -0800 Subject: [PATCH 537/804] f2fs: switch to fscrypt_prepare_lookup() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e7fd30e45f47..6e2c78c06f79 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -357,20 +357,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, trace_f2fs_lookup_start(dir, dentry, flags); - if (f2fs_encrypted_inode(dir)) { - err = fscrypt_get_encryption_info(dir); - - /* - * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is - * created while the directory was encrypted and we - * don't have access to 
the key. - */ - if (fscrypt_has_encryption_key(dir)) - fscrypt_set_encrypted_dentry(dentry); - fscrypt_set_d_op(dentry); - if (err && err != -ENOKEY) - goto out; - } + err = fscrypt_prepare_lookup(dir, dentry, flags); + if (err) + goto out; if (dentry->d_name.len > F2FS_NAME_LEN) { err = -ENAMETOOLONG; From c80f01959114c0f49bac2a006a419b8d59104353 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 29 Nov 2017 12:35:32 -0800 Subject: [PATCH 538/804] f2fs: switch to fscrypt_prepare_setattr() Reviewed-by: Chao Yu Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 81f298ac4cb2..69bb0bb44826 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -724,6 +724,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + if (is_quota_modification(inode, attr)) { err = dquot_initialize(inode); if (err) @@ -739,14 +743,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return err; - if (!fscrypt_has_encryption_key(inode)) - return -ENOKEY; - } - if (attr->ia_size <= i_size_read(inode)) { down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); From 3bc01114a338a9ac336b3e139948e69ef0488a43 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Tue, 28 Nov 2017 09:23:00 +0900 Subject: [PATCH 539/804] f2fs: apply write hints to select the type of segment for direct write When blocks are allocated for direct write, select the type of segment using the kiocb hint. But if an inode has FI_NO_ALLOC, use the inode hint. 
Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 28 ++++++++++++++++++++-------- fs/f2fs/f2fs.h | 11 +++++++++++ fs/f2fs/file.c | 6 ++++-- fs/f2fs/segment.c | 2 -- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d2558828915d..3b0cf32c1d66 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -781,7 +781,7 @@ got_it: return page; } -static int __allocate_data_block(struct dnode_of_data *dn) +static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; @@ -806,7 +806,7 @@ alloc: set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA, NULL, false); + &sum, seg_type, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -849,12 +849,16 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_len = 0; map.m_next_pgofs = NULL; + map.m_seg_type = NO_CHECK_TYPE; - if (iocb->ki_flags & IOCB_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) { + /* map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); */ + map.m_seg_type = rw_hint_to_seg_type(WRITE_LIFE_NOT_SET); return f2fs_map_blocks(inode, &map, 1, __force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); + } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) @@ -964,7 +968,8 @@ next_block: last_ofs_in_node = dn.ofs_in_node; } } else { - err = __allocate_data_block(&dn); + err = __allocate_data_block(&dn, + map->m_seg_type); if (!err) set_inode_flag(inode, FI_APPEND_WRITE); } @@ -1057,7 +1062,7 @@ out: static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs) + pgoff_t *next_pgofs, int seg_type) { struct f2fs_map_blocks map; int err; @@ -1065,6 +1070,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; + map.m_seg_type = seg_type; err = f2fs_map_blocks(inode, &map, create, flag); if (!err) { @@ -1080,14 +1086,18 @@ static int get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { return __get_data_block(inode, iblock, bh_result, create, - flag, next_pgofs); + flag, next_pgofs, + NO_CHECK_TYPE); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DEFAULT, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL, + rw_hint_to_seg_type( + WRITE_LIFE_NOT_SET)); + /* inode->i_write_hint)); */ } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1098,7 +1108,8 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock, return -EFBIG; return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP, NULL); + F2FS_GET_BLOCK_BMAP, NULL, + NO_CHECK_TYPE); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -1217,6 +1228,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; + map.m_seg_type = 
NO_CHECK_TYPE; for (; nr_pages; nr_pages--) { if (pages) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a785fd3453bb..721ea01f28bb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -606,6 +606,7 @@ struct f2fs_map_blocks { unsigned int m_len; unsigned int m_flags; pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + int m_seg_type; }; /* for flag in get_data_block */ @@ -2503,6 +2504,15 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = 1, /* RWH_WRITE_LIFE_NONE */ + WRITE_LIFE_SHORT = 2, /* RWH_WRITE_LIFE_SHORT */ + WRITE_LIFE_MEDIUM = 3, /* RWH_WRITE_LIFE_MEDIUM */ + WRITE_LIFE_LONG = 4, /* RWH_WRITE_LIFE_LONG */ + WRITE_LIFE_EXTREME = 5, /* RWH_WRITE_LIFE_EXTREME */ +}; + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); @@ -2756,6 +2766,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi); void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); +int rw_hint_to_seg_type(enum rw_hint hint); /* * checkpoint.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 69bb0bb44826..a5e96f8fc42c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1402,7 +1402,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_seg_type = NO_CHECK_TYPE }; pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -2042,7 +2043,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_defragment *range) { struct inode *inode = file_inode(filp); - struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL, + .m_seg_type = NO_CHECK_TYPE }; struct extent_info 
ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 94939a5a96c8..82fb22b5e4ad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2514,7 +2514,6 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -#if 0 int rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { @@ -2526,7 +2525,6 @@ int rw_hint_to_seg_type(enum rw_hint hint) return CURSEG_WARM_DATA; } } -#endif static int __get_segment_type_2(struct f2fs_io_info *fio) { From d94680798786ffd3e8c87c6d2727a75c6616dc5b Mon Sep 17 00:00:00 2001 From: LiFan Date: Tue, 28 Nov 2017 20:17:41 +0800 Subject: [PATCH 540/804] f2fs: remove a redundant conditional expression Avoid checking is_inode repeatedly, and make the logic a little bit clearer. Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 721ea01f28bb..62e6f630381c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2057,11 +2057,11 @@ static inline block_t datablock_addr(struct inode *inode, raw_node = F2FS_NODE(node_page); /* from GC path only */ - if (!inode) { - if (is_inode) + if (is_inode) { + if (!inode) base = offset_in_addr(&raw_node->i); - } else if (f2fs_has_extra_attr(inode) && is_inode) { - base = get_extra_isize(inode); + else if (f2fs_has_extra_attr(inode)) + base = get_extra_isize(inode); } addr_array = blkaddr_in_node(raw_node); From 8b33886c37cdff86070ca0fec4bdf7f644dea219 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:17 +0800 Subject: [PATCH 541/804] f2fs: inject fault to kzalloc This patch introduces f2fs_kzalloc based on f2fs_kmalloc in order to support error injection for kzalloc(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/namei.c | 2 +- fs/f2fs/node.c | 7 ++++--- fs/f2fs/segment.c | 30 ++++++++++++++++-------------- fs/f2fs/xattr.c | 8 ++++---- 7 files changed, 33 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2eb778174a9b..8e629434cd05 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -797,7 +797,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) block_t cp_blk_no; int i; - sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); + sbi->ckpt = f2fs_kzalloc(sbi, cp_blks * blk_size, GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 674f9bbe98d9..a66107b5cfff 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -439,7 +439,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; - si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); if (!si) return -ENOMEM; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 62e6f630381c..569d9fb0bada 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2513,6 +2513,12 @@ enum rw_hint { WRITE_LIFE_EXTREME = 5, /* RWH_WRITE_LIFE_EXTREME */ }; +static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); +} + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6e2c78c06f79..f44ce8c34966 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -530,7 +530,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct qstr istr = QSTR_INIT(symname, len); struct fscrypt_str ostr; - sd = kzalloc(disk_link.len, GFP_NOFS); + sd = f2fs_kzalloc(sbi, 
disk_link.len, GFP_NOFS); if (!sd) { err = -ENOMEM; goto err_out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dda40f5c4c9b..f10f685a2601 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2603,8 +2603,8 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + F2FS_BLKSIZE - 1); - nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, - GFP_KERNEL); + nm_i->nat_bits = f2fs_kzalloc(sbi, + nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; @@ -2750,7 +2750,8 @@ int build_node_manager(struct f2fs_sb_info *sbi) { int err; - sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL); + sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info), + GFP_KERNEL); if (!sbi->nm_info) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 82fb22b5e4ad..19403f8e2161 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -657,7 +657,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) goto init_thread; } - fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); + fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); @@ -1817,7 +1817,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) goto init_thread; } - dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); if (!dcc) return -ENOMEM; @@ -3419,7 +3419,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) unsigned int bitmap_size; /* allocate memory for SIT information */ - sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL); + sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; @@ -3437,29 +3437,30 @@ static int build_sit_info(struct f2fs_sb_info *sbi) for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map 
- = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sentries[start].cur_valid_map_mir - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map_mir) return -ENOMEM; #endif if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); if (!sit_i->sentries[start].discard_map) return -ENOMEM; } } - sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->tmp_map) return -ENOMEM; @@ -3508,7 +3509,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ - free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL); + free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; @@ -3549,12 +3550,12 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); + array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); - array[i].journal = kzalloc(sizeof(struct f2fs_journal), - GFP_KERNEL); + array[i].journal = f2fs_kzalloc(sbi, + sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].segno = NULL_SEGNO; @@ -3712,7 +3713,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) unsigned int bitmap_size, i; /* 
allocate memory for dirty segments list information */ - dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL); + dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), + GFP_KERNEL); if (!dirty_i) return -ENOMEM; @@ -3766,7 +3768,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) struct f2fs_sm_info *sm_info; int err; - sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL); + sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); if (!sm_info) return -ENOMEM; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7acf56ebda65..47ac858787ea 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -345,8 +345,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, - GFP_F2FS_ZERO); + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -398,8 +398,8 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, void *txattr_addr; int err; - txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, - GFP_F2FS_ZERO); + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), + inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); if (!txattr_addr) return -ENOMEM; From 5d4e487b9929cced66ccdeb29e0ef429fed2f504 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:18 +0800 Subject: [PATCH 542/804] f2fs: inject fault to kvmalloc This patch supports to inject fault into kvmalloc/kvzalloc. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 19 +++++++++++++++++++ fs/f2fs/file.c | 6 ++++-- fs/f2fs/node.c | 6 +++--- fs/f2fs/segment.c | 16 +++++++++------- fs/f2fs/super.c | 1 + 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 569d9fb0bada..1320f7255fb1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -44,6 +44,7 @@ #ifdef CONFIG_F2FS_FAULT_INJECTION enum { FAULT_KMALLOC, + FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, FAULT_ALLOC_BIO, @@ -2519,6 +2520,24 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, return f2fs_kmalloc(sbi, size, flags | __GFP_ZERO); } +static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KVMALLOC)) { + f2fs_show_injection_info(FAULT_KVMALLOC); + return NULL; + } +#endif + return kvmalloc(size, flags); +} + +static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ + return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); +} + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a5e96f8fc42c..260aeb0d8bc2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1082,11 +1082,13 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), + sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), + sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f10f685a2601..d833efceae82 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ 
-2729,17 +2729,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = f2fs_kvzalloc(sbi, nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 19403f8e2161..ac12f3deac75 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3425,13 +3425,14 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = f2fs_kvzalloc(sbi, MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -3465,7 +3466,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = f2fs_kvzalloc(sbi, MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -3516,12 +3517,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return 
-ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -3701,7 +3702,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -3724,7 +3725,8 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, + GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 037d22233886..17ac1c9b2f85 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -43,6 +43,7 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", + [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", [FAULT_ALLOC_BIO] = "alloc bio", From e7db649b5fb191a56fb83ed47c3bbe08f4b7c955 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:19 +0800 Subject: [PATCH 543/804] f2fs: spread f2fs_k{m,z}alloc Use f2fs_k{m,z}alloc as much as possible to increase fault injection points. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ac12f3deac75..fac18cc58c44 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3543,7 +3543,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); + array = f2fs_kzalloc(sbi, sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); if (!array) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 17ac1c9b2f85..ce3b4d88de6e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2154,14 +2154,15 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; - FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + FDEV(devi).blkz_type = f2fs_kmalloc(sbi, FDEV(devi).nr_blkz, + GFP_KERNEL); if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 - zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), - GFP_KERNEL); + zones = f2fs_kzalloc(sbi, sizeof(struct blk_zone) * + F2FS_REPORT_NR_ZONES, GFP_KERNEL); if (!zones) return -ENOMEM; @@ -2305,8 +2306,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) * Initialize multiple devices information, or single * zoned block device information. */ - sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), - GFP_KERNEL); + sbi->devs = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_info) * + max_devices, GFP_KERNEL); if (!sbi->devs) return -ENOMEM; @@ -2512,8 +2513,9 @@ try_onemore: int n = (i == META) ? 
1: NR_TEMP_TYPE; int j; - sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), - GFP_KERNEL); + sbi->write_io[i] = f2fs_kmalloc(sbi, + n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; goto free_options; From 1f994d47080c0bac79eb20e90649664799cfce28 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:20 +0800 Subject: [PATCH 544/804] f2fs: fix error handling in fill_super In fill_super, if we fail to call f2fs_build_stats(), it needs to detach from global f2fs shrink list, otherwise once system starts to shrink slab cache, we will encounter below panic: BUG: unable to handle kernel paging request at 00007d35 Oops: 0002 [#1] PREEMPT SMP EIP: __lock_acquire+0x70/0x12c0 Call Trace: lock_acquire+0xae/0x220 mutex_trylock+0xc5/0xf0 f2fs_shrink_count+0x32/0xb0 [f2fs] shrink_slab+0xf1/0x5b0 drop_slab_node+0x35/0x60 drop_slab+0xf/0x20 drop_caches_sysctl_handler+0x79/0xc0 proc_sys_call_handler+0xa4/0xc0 proc_sys_write+0x1f/0x30 __vfs_write+0x24/0x150 SyS_write+0x44/0x90 do_fast_syscall_32+0xa1/0x1ca entry_SYSENTER_32+0x4c/0x7b In addition, this patch relocates f2fs_join_shrinker in fill_super to avoid unneeded error handling of it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce3b4d88de6e..b6a96a8fb794 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2623,18 +2623,16 @@ try_onemore: goto free_nm; } - f2fs_join_shrinker(sbi); - err = f2fs_build_stats(sbi); if (err) - goto free_nm; + goto free_node_inode; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); - goto free_node_inode; + goto free_stats; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); @@ -2730,6 +2728,8 @@ skip_recovery: sbi->valid_super_block ? 
1 : 2, err); } + f2fs_join_shrinker(sbi); + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); @@ -2756,14 +2756,12 @@ free_sysfs: free_root_inode: dput(sb->s_root); sb->s_root = NULL; -free_node_inode: - truncate_inode_pages_final(NODE_MAPPING(sbi)); - mutex_lock(&sbi->umount_mutex); - release_ino_entry(sbi, true); - f2fs_leave_shrinker(sbi); - iput(sbi->node_inode); - mutex_unlock(&sbi->umount_mutex); +free_stats: f2fs_destroy_stats(sbi); +free_node_inode: + release_ino_entry(sbi, true); + truncate_inode_pages_final(NODE_MAPPING(sbi)); + iput(sbi->node_inode); free_nm: destroy_node_manager(sbi); free_sm: From e4f5e26cdadf858848e1157067fc853d3c41eb4e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:21 +0800 Subject: [PATCH 545/804] f2fs: clean up hash codes f2fs_chksum and f2fs_crc32 use the same 'crc32' crypto engine, also their implementation are almost the same, except with different shash description context. Introduce __f2fs_crc32 to wrap the common codes, and reuse it in f2fs_chksum and f2fs_crc32. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1320f7255fb1..751654b48a4d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1319,30 +1319,7 @@ static inline bool is_idle(struct f2fs_sb_info *sbi) /* * Inline functions */ -static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, - unsigned int length) -{ - SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); - u32 *ctx = (u32 *)shash_desc_ctx(shash); - int err; - - shash->tfm = sbi->s_chksum_driver; - shash->flags = 0; - *ctx = F2FS_SUPER_MAGIC; - - err = crypto_shash_update(shash, address, length); - BUG_ON(err); - - return *ctx; -} - -static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, - void *buf, size_t buf_size) -{ - return f2fs_crc32(sbi, buf, buf_size) == blk_crc; -} - -static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, +static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, const void *address, unsigned int length) { struct { @@ -1363,6 +1340,24 @@ static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, return *(u32 *)desc.ctx; } +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length); +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + return __f2fs_crc32(sbi, crc, address, length); +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); From f7986c416d1b4d50e9129c02d6e2d6849db3ea24 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 
19:28:22 +0800 Subject: [PATCH 546/804] f2fs: clean up f2fs_map_blocks f2fs_map_blocks(): if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { ... } else { ... if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { ... } if (flag != F2FS_GET_BLOCK_FIEMAP || blkaddr != NEW_ADDR) goto sync_out; } It means we can break the loop in cases of: a) flag != F2FS_GET_BLOCK_FIEMAP or b) flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR Condition b) is the same as previous one, so merge operations of them for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3b0cf32c1d66..b7fd9f010b2b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -986,9 +986,9 @@ next_block: blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; + goto sync_out; } - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) + if (flag != F2FS_GET_BLOCK_FIEMAP) goto sync_out; } } From 925d0933d8f0f0ed01cca63c094ff6ae316d0787 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 30 Nov 2017 19:28:23 +0800 Subject: [PATCH 547/804] f2fs: don't return value in truncate_data_blocks_range There is no caller cares about return value of truncate_data_blocks_range, remove it. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 751654b48a4d..9a8a2624944b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2594,7 +2594,7 @@ int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int f2fs_setattr(struct dentry *dentry, struct iattr *attr); int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); -int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +void truncate_data_blocks_range(struct dnode_of_data *dn, int count); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 260aeb0d8bc2..b88efbfd22e7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -481,7 +481,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return dquot_file_open(inode, filp); } -int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +void truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; @@ -524,7 +524,6 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); - return nr_free; } void truncate_data_blocks(struct dnode_of_data *dn) From 04d44000d633c51c2732cfd4e3540ae250299646 Mon Sep 17 00:00:00 2001 From: LiFan Date: Tue, 5 Dec 2017 16:38:01 +0800 Subject: [PATCH 548/804] f2fs: use unlikely for release case Since the variable release is only nonzero when another unlikely case occurs, use unlikely() on it seems logical. 
Signed-off-by: Fan li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a8a2624944b..58e9c35bd55b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1658,7 +1658,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, } spin_unlock(&sbi->stat_lock); - if (release) + if (unlikely(release)) dquot_release_reservation_block(inode, release); f2fs_i_blocks_write(inode, *count, true, true); return 0; From e81cafbeba4bf252b24778a17aef3f623a0815e3 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 6 Dec 2017 11:31:29 +0800 Subject: [PATCH 549/804] f2fs: no need return value in restore summary process No need return value in restore summary process Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 14 +++----------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 58e9c35bd55b..b59be85c5e24 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2721,7 +2721,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void recover_inline_xattr(struct inode *inode, struct page *page); int recover_xattr_data(struct inode *inode, struct page *page); int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -int restore_node_summary(struct f2fs_sb_info *sbi, +void restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int build_node_manager(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d833efceae82..9453975c9799 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2347,7 +2347,7 @@ retry: return 0; } -int restore_node_summary(struct f2fs_sb_info *sbi, +void restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct 
f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2380,7 +2380,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, invalidate_mapping_pages(META_MAPPING(sbi), addr, addr + nrpages); } - return 0; } static void remove_nats_in_journal(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fac18cc58c44..2206c297ec16 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2904,7 +2904,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static int read_compacted_summaries(struct f2fs_sb_info *sbi) +static void read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; @@ -2961,7 +2961,6 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) } } f2fs_put_page(page, 1); - return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) @@ -3007,13 +3006,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - int err; - - err = restore_node_summary(sbi, segno, sum); - if (err) { - f2fs_put_page(new, 1); - return err; - } + restore_node_summary(sbi, segno, sum); } } @@ -3052,8 +3045,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) META_CP, true); /* restore for compacted data summary */ - if (read_compacted_summaries(sbi)) - return -EINVAL; + read_compacted_summaries(sbi); type = CURSEG_HOT_NODE; } From cd38d5ada5a4dcad36e9791a279cecc3de57bd13 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Dec 2017 14:11:40 +0800 Subject: [PATCH 550/804] f2fs: fix potential hangtask in f2fs_trace_pid As Jia-Ju Bai reported: "According to fs/f2fs/trace.c, the kernel module may sleep under a spinlock. The function call path is: f2fs_trace_pid (acquire the spinlock) f2fs_radix_tree_insert cond_resched --> may sleep I do not find a good way to fix it, so I only report. This possible bug is found by my static analysis tool (DSAC) and my code review." 
Obviously, it's problematic to schedule in the critical region of a spinlock, which will cause uninterruptible sleep if there is no waker. This patch changes to use a mutex lock instead of a spinlock to avoid this condition. Reported-by: Jia-Ju Bai Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/trace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index bccbbf2616d2..a1fcd00bbb2b 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -17,7 +17,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; +static struct mutex pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -64,7 +64,7 @@ void f2fs_trace_pid(struct page *page) if (radix_tree_preload(GFP_NOFS)) return; - spin_lock(&pids_lock); + mutex_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; @@ -77,7 +77,7 @@ void f2fs_trace_pid(struct page *page) MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - spin_unlock(&pids_lock); + mutex_unlock(&pids_lock); radix_tree_preload_end(); } @@ -122,7 +122,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - spin_lock_init(&pids_lock); + mutex_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -150,7 +150,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - spin_lock(&pids_lock); + mutex_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -158,5 +158,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - spin_unlock(&pids_lock); + mutex_unlock(&pids_lock); } From 8a2c11d8658d8136352713372fbd143a60af5533 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 5 Dec 2017 12:07:47 +0800 Subject: [PATCH 551/804] f2fs: fix an error case of missing update inode page -Thread A Thread B -write_checkpoint
-block_operations -f2fs_unlock_all -f2fs_sync_file -f2fs_write_inode -f2fs_inode_synced -f2fs_sync_inode_meta -sync_node_pages -set_page_dirty In this case, if a sudden power-off happens before the next checkpoint, the last inode page update will be lost. wb_writeback behaves the same as fsync. Yunlei also reproduced the bug by: @@ -366,7 +366,7 @@ int update_inode(struct inode *inode, struct page *node_page) struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); - + msleep(10000); f2fs_wait_on_page_writeback(node_page, NODE, true); shell 1: shell2: dd if=/dev/zero of=./test bs=1M count=10 sync echo "hello" >> ./test fsync test // sleep 10s sync //return quickly echo c > /proc/sysrq-trigger Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/inode.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b59be85c5e24..9502ec303555 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2607,8 +2607,8 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); -int update_inode(struct inode *inode, struct page *node_page); -int update_inode_page(struct inode *inode); +void update_inode(struct inode *inode, struct page *node_page); +void update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); void handle_failed_inode(struct inode *inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b4c4f2b25304..234322889e65 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -360,14 +360,15 @@ retry: return inode; } -int update_inode(struct inode *inode, struct page *node_page) +void update_inode(struct inode *inode, struct page *node_page) { struct
f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; - f2fs_inode_synced(inode); - f2fs_wait_on_page_writeback(node_page, NODE, true); + set_page_dirty(node_page); + + f2fs_inode_synced(inode); ri = F2FS_INODE(node_page); @@ -426,14 +427,12 @@ int update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - return set_page_dirty(node_page); } -int update_inode_page(struct inode *inode) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; - int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -444,11 +443,10 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - return 0; + return; } - ret = update_inode(inode, node_page); + update_inode(inode, node_page); f2fs_put_page(node_page, 1); - return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) From e2bb618a0a6bb232c22b37d27a5a631f2fc198af Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 19 Dec 2017 19:16:34 -0800 Subject: [PATCH 552/804] f2fs: return error during fill_super Let's avoid BUG_ON during fill_super, when the on-disk data was totally corrupted.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++---- fs/f2fs/segment.h | 22 ++++++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2206c297ec16..6af71864b501 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3557,7 +3557,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } -static void build_sit_entries(struct f2fs_sb_info *sbi) +static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); @@ -3567,6 +3567,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; + int err = 0; do { readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, @@ -3585,7 +3586,9 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); - check_block_count(sbi, start, &sit); + err = check_block_count(sbi, start, &sit); + if (err) + return err; seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ @@ -3620,7 +3623,9 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) old_valid_blocks = se->valid_blocks; - check_block_count(sbi, start, &sit); + err = check_block_count(sbi, start, &sit); + if (err) + break; seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { @@ -3640,6 +3645,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) se->valid_blocks - old_valid_blocks; } up_read(&curseg->journal_rwsem); + return err; } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -3814,7 +3820,9 @@ int build_segment_manager(struct f2fs_sb_info *sbi) return err; /* reinit free segmap based on SIT */ - build_sit_entries(sbi); + err = build_sit_entries(sbi); + if (err) + return err; init_free_segmap(sbi); err = 
build_dirty_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5264b6ed120c..5c4d432ebf1d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -655,7 +655,7 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) /* * Summary block is always treated as an invalid block */ -static inline void check_block_count(struct f2fs_sb_info *sbi, +static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { #ifdef CONFIG_F2FS_CHECK_FS @@ -677,11 +677,25 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, cur_pos = next_pos; is_valid = !is_valid; } while (cur_pos < sbi->blocks_per_seg); - BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); + + if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Mismatch valid blocks %d vs. %d", + GET_SIT_VBLOCKS(raw_sit), valid_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EINVAL; + } #endif /* check segment usage, and check boundary of a given segment number */ - f2fs_bug_on(sbi, GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg - || segno > TOTAL_SEGS(sbi) - 1); + if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg + || segno > TOTAL_SEGS(sbi) - 1)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong valid blocks %d or segno %u", + GET_SIT_VBLOCKS(raw_sit), segno); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EINVAL; + } + return 0; } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, From cff2c7fe417b5f5750af6e665dd972e8efe70761 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Dec 2017 08:09:44 -0800 Subject: [PATCH 553/804] f2fs: recover directory operations by fsync This fixes generic/342 which doesn't recover renamed file which was fsynced before. It will be done via another fsync on newly created file. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 ++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 3 +++ fs/f2fs/namei.c | 4 ++++ include/trace/events/f2fs.h | 3 ++- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 55fb45b66ed2..bde445e4e690 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -713,6 +713,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9502ec303555..8fa9cc3cdf23 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -247,6 +247,7 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ + TRANS_DIR_INO, /* for trasactions dir ino list */ FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. 
list */ }; @@ -988,6 +989,7 @@ enum cp_reason_type { CP_NODE_NEED_CP, CP_FASTBOOT_MODE, CP_SPEC_LOG_NUM, + CP_RECOVER_DIR, }; enum iostat_type { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b88efbfd22e7..de0a167c8238 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -168,6 +168,9 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; + else if (need_dentry_mark(sbi, inode->i_ino) && + exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) + cp_reason = CP_RECOVER_DIR; return cp_reason; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index f44ce8c34966..a72c226c4d30 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -932,6 +932,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_i_links_write(old_dir, false); } + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -1086,6 +1087,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); + add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_unlock_op(sbi); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 589df6f73789..0cdf6cc5c557 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -138,7 +138,8 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_NO_SPC_ROLL, "no space roll forward" }, \ { CP_NODE_NEED_CP, "node needs cp" }, \ { CP_FASTBOOT_MODE, "fastboot mode" }, \ - { CP_SPEC_LOG_NUM, "log type is 2" }) + { CP_SPEC_LOG_NUM, "log type is 2" }, \ + { CP_RECOVER_DIR, "dir needs recovery" }) struct victim_sel_policy; struct f2fs_map_blocks; From 25ef3006ba2320a9ee75d3afbfb02c482de9ee1b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 28 Dec 2017 17:47:19 -0800 Subject: [PATCH 554/804] f2fs: fix missing error 
number for xattr operation This fixes generic/449 hang problem caused by no ENOSPC forever which should be returned by setxattr under disk full scenario. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 47ac858787ea..353fbff85bab 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -480,6 +480,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (F2FS_I(inode)->i_xattr_nid) { xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); alloc_nid_failed(sbi, new_nid); goto in_page_out; } @@ -490,6 +491,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); alloc_nid_failed(sbi, new_nid); goto in_page_out; } From 54bf13a0adcdb523deb12c23405a853115ee13bb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 31 Dec 2017 16:26:38 -0800 Subject: [PATCH 555/804] f2fs: skip stop_checkpoint for user data writes We can give another chance to write user data, which can resolve generic/441. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b7fd9f010b2b..a1dc4cfdcb8e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -110,7 +110,8 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_error)) { set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi, true); + if (type == F2FS_WB_CP_DATA) + f2fs_stop_checkpoint(sbi, true); } dec_page_count(sbi, type); clear_cold_data(page); From 2b4d859bd9d89cd0dd4b2da699930208583488d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 2 Jan 2018 11:03:19 -0800 Subject: [PATCH 556/804] f2fs: enable quota at remount from r to w We have to enable quota only when remounting from read to write. Otherwise, we'll get remount failure. (e.g., write to write case) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b6a96a8fb794..ff9affb32890 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1266,7 +1266,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else { + } else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~MS_RDONLY; if (sb_any_quota_suspended(sb)) { From 87b8168e9ef006e25036eba5fa0e7aa8ee95880a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 3 Jan 2018 17:30:19 +0800 Subject: [PATCH 557/804] f2fs: continue to do direct IO if we only preallocate partial blocks While doing direct IO, if we run out-of-space when we preallocate blocks, we should not return ENOSPC error directly, instead, we should continue to do following direct IO, which will keep directIO of f2fs acting like other filesystems. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a1dc4cfdcb8e..08b126366658 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -830,10 +830,12 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; + int flag; int err = 0; + bool direct_io = iocb->ki_flags & IOCB_DIRECT; /* convert inline data for Direct I/O*/ - if (iocb->ki_flags & IOCB_DIRECT) { + if (direct_io) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -852,26 +854,30 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; map.m_seg_type = NO_CHECK_TYPE; - if (iocb->ki_flags & IOCB_DIRECT) { + if (direct_io) { /* map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); */ map.m_seg_type = rw_hint_to_seg_type(WRITE_LIFE_NOT_SET); - return f2fs_map_blocks(inode, &map, 1, - __force_buffered_io(inode, WRITE) ? - F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO); + flag = __force_buffered_io(inode, WRITE) ? 
+ F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO; + goto map_blocks; } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - if (!f2fs_has_inline_data(inode)) { - err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (map.m_len > 0 && err == -ENOSPC) { - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } + if (f2fs_has_inline_data(inode)) return err; + + flag = F2FS_GET_BLOCK_PRE_AIO; + +map_blocks: + err = f2fs_map_blocks(inode, &map, 1, flag); + if (map.m_len > 0 && err == -ENOSPC) { + if (!direct_io) + set_inode_flag(inode, FI_NO_PREALLOC); + err = 0; } return err; } From b242d7edc5379043477d7eb817d2e7488f2fc16c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 3 Jan 2018 17:32:51 +0800 Subject: [PATCH 558/804] f2fs: clean up unneeded declaration Commit 6afc662e68b5 ("f2fs: support flexible inline xattr size") declared f2fs_sb_has_flexible_inline_xattr in f2fs.h for latter being used in get_inline_xattr_addrs, but in latter version, related code has been changed, leave f2fs_sb_has_flexible_inline_xattr w/o any users. Let's remove it for cleanup. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8fa9cc3cdf23..4c9f762ed355 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2540,7 +2540,6 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } -static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); static inline int get_inline_xattr_addrs(struct inode *inode) { return F2FS_I(inode)->i_inline_xattr_size; From 591b336387338e054067ff82bb535d4fde06179f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 3 Jan 2018 10:55:07 -0800 Subject: [PATCH 559/804] f2fs: show precise # of blocks that user/root can use Let's show precise # of blocks that user/root can use through bavail and bfree respectively. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ff9affb32890..4f888e1c5bae 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -995,20 +995,19 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - block_t total_count, user_block_count, start_count, ovp_count; + block_t total_count, user_block_count, start_count; u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); - ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + buf->f_bfree = user_block_count - valid_user_blocks(sbi) - 
sbi->current_reserved_blocks; + buf->f_bavail = buf->f_bfree; avail_node_count = sbi->total_node_count - sbi->nquota_files - F2FS_RESERVED_NODE_NUM; From 4c6bc4be375adf7c4c99188829dfd17fccae930c Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 3 Jan 2018 18:03:04 +0800 Subject: [PATCH 560/804] f2fs: update inode info to inode page for new file After checkpoint, 1. create a new file A (with dirty inode && dirty inode page && xattr info) 2. background wb writes back file A inode page (without update from inode cache) 3. fsync file A, write back inode page of file A with inode cache info 4. sudden power off before new checkpoint In this case, the recovery process will try to recover a zero inode page. The inline xattr flag of file A will be missing and xattr info will be taken as blkaddr index. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9453975c9799..ec486ec074da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2229,7 +2229,9 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); - if (!(ri->i_inline & F2FS_INLINE_XATTR)) { + if (ri->i_inline & F2FS_INLINE_XATTR) { + set_inode_flag(inode, FI_INLINE_XATTR); + } else { clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } From 06a366757ff766936c307afef902300f602cb6a2 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 4 Jan 2018 15:02:02 +0800 Subject: [PATCH 561/804] f2fs: check segment type in __f2fs_replace_block In some cases, a node block has a wrong blkaddr whose segment type is NODE, e.g., the recovered inode has a missing xattr flag and the blkaddr is in the xattr range.
Since fsck.f2fs does not check the recovery nodes, this will cause __f2fs_replace_block change the curseg of node and do the update_sit_entry(sbi, new_blkaddr, 1) with no next_blkoff refresh, as a result, when recovery process write checkpoint and sync nodes, the next_blkoff of curseg is used in the segment bit map, then it will cause f2fs_bug_on. So let's check segment type in __f2fs_replace_block. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6af71864b501..96b01c7bea42 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2820,6 +2820,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } } + f2fs_bug_on(sbi, !IS_DATASEG(type)); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From 62438ba87b798597d73aa86d0181aaafd11cd067 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 27 Dec 2017 15:05:52 -0800 Subject: [PATCH 562/804] f2fs: add reserved blocks for root user This patch allows root to reserve some blocks via mount option. "-o reserve_root=N" means N x 4KB-sized blocks for root only. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 20 ++++++++++++++++---- fs/f2fs/super.c | 37 ++++++++++++++++++++++++++++++++++++- fs/f2fs/sysfs.c | 3 ++- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4c9f762ed355..38d595b99f58 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -96,6 +96,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 +#define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -1169,6 +1170,7 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + block_t root_reserved_blocks; /* root reserved blocks */ unsigned int nquota_files; /* # of quota sysfile */ @@ -1647,11 +1649,17 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->total_valid_block_count += (block_t)(*count); avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; + + if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + avail_user_block_count -= sbi->root_reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; + if (diff > *count) + diff = *count; *count -= diff; release = diff; - sbi->total_valid_block_count = avail_user_block_count; + sbi->total_valid_block_count -= diff; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); @@ -1840,9 +1848,13 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + 1; - if 
(unlikely(valid_block_count + sbi->current_reserved_blocks > - sbi->user_block_count)) { + valid_block_count = sbi->total_valid_block_count + + sbi->current_reserved_blocks + 1; + + if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + valid_block_count += sbi->root_reserved_blocks; + + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4f888e1c5bae..9ec270a961e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -107,6 +107,7 @@ enum { Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, + Opt_reserve_root, Opt_mode, Opt_io_size_bits, Opt_fault_injection, @@ -157,6 +158,7 @@ static match_table_t f2fs_tokens = { {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, + {Opt_reserve_root, "reserve_root=%u"}, {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, @@ -191,6 +193,19 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
va_end(args); } +static inline void limit_reserve_root(struct f2fs_sb_info *sbi) +{ + block_t limit = (sbi->user_block_count << 1) / 1000; + + /* limit is 0.2% */ + if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) { + sbi->root_reserved_blocks = limit; + f2fs_msg(sbi->sb, KERN_INFO, + "Reduce reserved blocks for root = %u", + sbi->root_reserved_blocks); + } +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -488,6 +503,18 @@ static int parse_options(struct super_block *sb, char *options) case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; + case Opt_reserve_root: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (test_opt(sbi, RESERVE_ROOT)) { + f2fs_msg(sb, KERN_INFO, + "Preserve previous reserve_root=%u", + sbi->root_reserved_blocks); + } else { + sbi->root_reserved_blocks = arg; + set_opt(sbi, RESERVE_ROOT); + } + break; case Opt_mode: name = match_strdup(&args[0]); @@ -1007,7 +1034,10 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - buf->f_bavail = buf->f_bfree; + if (buf->f_bfree > sbi->root_reserved_blocks) + buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks; + else + buf->f_bavail = 0; avail_node_count = sbi->total_node_count - sbi->nquota_files - F2FS_RESERVED_NODE_NUM; @@ -1136,6 +1166,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (test_opt(sbi, RESERVE_ROOT)) + seq_printf(seq, ",reserve_root=%u", + sbi->root_reserved_blocks); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1334,6 +1367,7 @@ skip: sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); + limit_reserve_root(sbi); return 0; restore_gc: if (need_restart_gc) { @@ -2577,6 +2611,7 @@ try_onemore: sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; sbi->current_reserved_blocks = 0; + limit_reserve_root(sbi); for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 93c3364250dd..ab6028c332aa 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -162,7 +162,8 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if (t > (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)(sbi->user_block_count - + sbi->root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } From b78e9302e2e358d45ea4377bf2c20d045f1c3b8a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 5 Jan 2018 09:41:20 +0000 Subject: [PATCH 563/804] f2fs: make local functions static Fixes the following sparse warnings: fs/f2fs/segment.c:887:6: warning: symbol '__check_sit_bitmap' was not declared. Should it be static? fs/f2fs/segment.c:1327:6: warning: symbol 'f2fs_wait_discard_bio' was not declared. Should it be static? fs/f2fs/super.c:1661:5: warning: symbol 'f2fs_get_projid' was not declared. Should it be static? 
Signed-off-by: Wei Yongjun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 96b01c7bea42..116e50470360 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -965,7 +965,7 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } -void __check_sit_bitmap(struct f2fs_sb_info *sbi, +static void __check_sit_bitmap(struct f2fs_sb_info *sbi, block_t start, block_t end) { #ifdef CONFIG_F2FS_CHECK_FS @@ -1404,7 +1404,7 @@ static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, } /* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9ec270a961e2..ec13397b635c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1693,7 +1693,7 @@ void f2fs_quota_off_umount(struct super_block *sb) } #if 0 -int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { *projid = F2FS_I(inode)->i_projid; return 0; From d4f19f6266abaf573312c78723e09fb6498980ab Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 8 Jan 2018 18:48:33 +0800 Subject: [PATCH 564/804] f2fs: avoid high cpu usage in discard thread We take very long time to finish generic/476, this is because we will check consistence of all discard entries in global rb tree while traversing all different granularity pending lists, even when the list is empty, in order to avoid that unneeded overhead, we have to skip the check when coming up an empty list. 
generic/476 time consumption: cost Before patch & w/o consistence check 57s Before patch & w/ consistence check 1426s After patch 78s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 116e50470360..d13e36b292a4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1284,6 +1284,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); + if (list_empty(pend_list)) + goto next; f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { @@ -1302,6 +1304,7 @@ skip: break; } blk_finish_plug(&plug); +next: mutex_unlock(&dcc->cmd_lock); if (iter >= dpolicy->max_requests) From 1338f376d5a344fe786cc497e68c223508d2a937 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 8 Jan 2018 18:48:34 +0800 Subject: [PATCH 565/804] f2fs: remove unused pend_list_tag In commit 78997b569f56 ("f2fs: split discard policy"), we got rid of all uses of the pend_list_tag field in struct discard_cmd_control, but forgot to remove the field itself; remove it now.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 38d595b99f58..635866e33a31 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -334,7 +334,6 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ - unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ From f53dcf6799abaf7776bc82679beb4382e8a85f9b Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Tue, 9 Jan 2018 19:33:39 +0800 Subject: [PATCH 566/804] f2fs: implement cgroup writeback support Cgroup writeback requires explicit support from the filesystem. f2fs's data and node writeback IOs go through __write_data_page, which sets fio for submitting IOs. So, we add io_wbc for fio, associate bios with blkcg by invoking wbc_init_bio() and account issued IOs with wbc_account_io(). In addition, f2fs_fill_super() is updated to set SB_I_CGROUPWB. Meta writeback IOs are left alone by this patch and will always be attributed to the root cgroup. The results show that f2fs can throttle writeback nicely for data writing and file creating. Reviewed-by: Chao Yu Signed-off-by: Yufen Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 ++++++++++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 1 + fs/f2fs/super.c | 1 + 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 08b126366658..8148dff3732e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -168,6 +168,7 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, * Low-level block read/write IO operations.
*/ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + struct writeback_control *wbc, int npages, bool is_read) { struct bio *bio; @@ -177,6 +178,8 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; + if (wbc) + wbc_init_bio(wbc, bio); return bio; } @@ -372,7 +375,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, + 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -434,7 +438,7 @@ alloc_new: dec_page_count(sbi, WB_DATA_TYPE(bio_page)); goto out_fail; } - io->bio = __bio_alloc(sbi, fio->new_blkaddr, + io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, BIO_MAX_PAGES, false); io->fio = *fio; } @@ -444,6 +448,9 @@ alloc_new: goto alloc_new; } + if (fio->io_wbc) + wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); @@ -1528,6 +1535,7 @@ static int __write_data_page(struct page *page, bool *submitted, .submitted = false, .need_lock = LOCK_RETRY, .io_type = io_type, + .io_wbc = wbc, }; trace_f2fs_writepage(page, DATA); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 635866e33a31..90b412bb4e3b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1025,6 +1025,7 @@ struct f2fs_io_info { int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ enum iostat_type io_type; /* io type */ + struct writeback_control *io_wbc; /* writeback control */ }; #define is_read_io(rw) ((rw) == READ) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ec486ec074da..676b0e3f5ef3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1339,6 +1339,7 @@ 
static int __write_node_page(struct page *page, bool atomic, bool *submitted, .encrypted_page = NULL, .submitted = false, .io_type = io_type, + .io_wbc = wbc, }; trace_f2fs_writepage(page, NODE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ec13397b635c..38e33f6d1f93 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2526,6 +2526,7 @@ try_onemore: sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); + sb->s_iflags |= SB_I_CGROUPWB; /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; From 2ce6b9d8167e2785ea01011bc60c2f95b6313dea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 4 Jan 2018 21:36:09 -0800 Subject: [PATCH 567/804] f2fs: add resgid and resuid to reserve root blocks This patch adds mount options to reserve some blocks via resgid=%u,resuid=%u. It only activates with reserve_root=%u. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 27 +++++++++++++++++++++++++-- fs/f2fs/super.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 90b412bb4e3b..c35f87423123 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -198,6 +199,12 @@ static inline struct timespec current_time(struct inode *inode) return timespec_trunc(now, inode->i_sb->s_time_gran); } +/* + * Default values for user and/or group using reserved blocks + */ +#define F2FS_DEF_RESUID 0 +#define F2FS_DEF_RESGID 0 + /* * For checkpoint manager */ @@ -1171,6 +1178,8 @@ struct f2fs_sb_info { block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid 
*/ unsigned int nquota_files; /* # of quota sysfile */ @@ -1620,6 +1629,20 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi) +{ + if (!test_opt(sbi, RESERVE_ROOT)) + return false; + if (capable(CAP_SYS_RESOURCE)) + return true; + if (uid_eq(sbi->s_resuid, current_fsuid())) + return true; + if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && + in_group_p(sbi->s_resgid)) + return true; + return false; +} + static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) @@ -1650,7 +1673,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + if (!__allow_reserved_blocks(sbi)) avail_user_block_count -= sbi->root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { @@ -1851,7 +1874,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, valid_block_count = sbi->total_valid_block_count + sbi->current_reserved_blocks + 1; - if (!(test_opt(sbi, RESERVE_ROOT) && capable(CAP_SYS_RESOURCE))) + if (!__allow_reserved_blocks(sbi)) valid_block_count += sbi->root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 38e33f6d1f93..d89834b79646 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -108,6 +108,8 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_reserve_root, + Opt_resgid, + Opt_resuid, Opt_mode, Opt_io_size_bits, Opt_fault_injection, @@ -159,6 +161,8 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_reserve_root, "reserve_root=%u"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, 
{Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, @@ -204,6 +208,15 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) "Reduce reserved blocks for root = %u", sbi->root_reserved_blocks); } + if (!test_opt(sbi, RESERVE_ROOT) && + (!uid_eq(sbi->s_resuid, + make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || + !gid_eq(sbi->s_resgid, + make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) + f2fs_msg(sbi->sb, KERN_INFO, + "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + from_kuid_munged(&init_user_ns, sbi->s_resuid), + from_kgid_munged(&init_user_ns, sbi->s_resgid)); } static void init_once(void *foo) @@ -336,6 +349,8 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; + kuid_t uid; + kgid_t gid; #ifdef CONFIG_QUOTA int ret; #endif @@ -515,6 +530,28 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, RESERVE_ROOT); } break; + case Opt_resuid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid uid value %d", arg); + return -EINVAL; + } + sbi->s_resuid = uid; + break; + case Opt_resgid: + if (args->from && match_int(args, &arg)) + return -EINVAL; + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + f2fs_msg(sb, KERN_ERR, + "Invalid gid value %d", arg); + return -EINVAL; + } + sbi->s_resgid = gid; + break; case Opt_mode: name = match_strdup(&args[0]); @@ -1167,8 +1204,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (test_opt(sbi, RESERVE_ROOT)) - seq_printf(seq, ",reserve_root=%u", - sbi->root_reserved_blocks); + seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + sbi->root_reserved_blocks, + from_kuid_munged(&init_user_ns, sbi->s_resuid), + from_kgid_munged(&init_user_ns, 
sbi->s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -2463,6 +2502,9 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, From d49132d45cb07dc77904bf9b6501df2dd77b251b Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 10 Jan 2018 16:49:10 +0900 Subject: [PATCH 568/804] f2fs: handle newly created page when revoking inmem pages When committing inmem pages is successful, we revoke already committed blocks in __revoke_inmem_pages() and finally replace the committed ones with the old blocks using f2fs_replace_block(). However, if the committed block was a newly created one, the address of the old block is NEW_ADDR and __f2fs_replace_block() cannot handle NEW_ADDR as new_blkaddr properly and a kernel panic occurs.
Signed-off-by: Daeho Jeong Tested-by: Shu Tan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d13e36b292a4..7638ebb1c343 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -248,7 +248,11 @@ retry: goto next; } get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + if (cur->old_addr == NEW_ADDR) { + invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } else + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, cur->old_addr, ni.version, true, true); f2fs_put_dnode(&dn); } From b203c58dfd5538d1a7f99737db6d3653b7601c82 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 10 Jan 2018 18:18:51 +0800 Subject: [PATCH 569/804] f2fs: fix to calculate required free section correctly When calculating the required free sections during file defragmenting, we should skip holes in the file, otherwise we will probably fail to defragment a large sparse file.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index de0a167c8238..56f6b21cd9a9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2101,10 +2101,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, continue; } - if (blk_end && blk_end != map.m_pblk) { + if (blk_end && blk_end != map.m_pblk) fragmented = true; - break; - } + + /* record total count of block that we're going to move */ + total += map.m_len; + blk_end = map.m_pblk + map.m_len; map.m_lblk += map.m_len; @@ -2113,10 +2115,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, if (!fragmented) goto out; - map.m_lblk = pg_start; - map.m_len = pg_end - pg_start; - - sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); + sec_num = (total + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -2128,6 +2127,10 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; } + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + total = 0; + while (map.m_lblk < pg_end) { pgoff_t idx; int cnt = 0; From 10f4a4140b618ea89740dd76ff47aedbd5161f84 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 11 Jan 2018 14:19:32 +0800 Subject: [PATCH 570/804] f2fs: check node page again in write end io Check node page again in write end io in case of data corruption during in-flight IO.
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8148dff3732e..bbb6eb79351f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -113,6 +113,10 @@ static void f2fs_write_end_io(struct bio *bio) if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true); } + + f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && + page->index != nid_of_node(page)); + dec_page_count(sbi, type); clear_cold_data(page); end_page_writeback(page); From 6afa9a94d09b1e9155db2aa41555e31696d31bf7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 11 Jan 2018 14:37:35 +0800 Subject: [PATCH 571/804] f2fs: fix to cover f2fs_inline_data_fiemap with inode_lock This patch fix to cover f2fs_inline_data_fiemap with inode_lock in order to make that interface avoiding race with mapping change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bbb6eb79351f..ab32b33c8e02 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1154,14 +1154,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; + inode_lock(inode); + if (f2fs_has_inline_data(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) - return ret; + goto out; } - inode_lock(inode); - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); From 58b1f5b0fcf1b203c3a44eaedd0f2db572a01069 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 11 Jan 2018 14:39:57 +0800 Subject: [PATCH 572/804] f2fs: support FIEMAP_FLAG_XATTR This patch enables ->fiemap to handle FIEMAP_FLAG_XATTR flag for xattr mapping info lookup purpose. It makes f2fs passing generic/425 test in fstest. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ab32b33c8e02..2be6e1999ab3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1140,6 +1140,68 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) return (blk << inode->i_blkbits); } +static int f2fs_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct page *page; + struct node_info ni; + __u64 phys = 0, len; + __u32 flags; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + int err = 0; + + if (f2fs_has_inline_xattr(inode)) { + int offset; + + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), + inode->i_ino, false); + if (!page) + return -ENOMEM; + + get_node_info(sbi, inode->i_ino, &ni); + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + offset = offsetof(struct f2fs_inode, i_addr) + + sizeof(__le32) * (DEF_ADDRS_PER_INODE - + F2FS_INLINE_XATTR_ADDRS(inode)); + + phys += offset; + len = inline_xattr_size(inode); + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; + + if (!xnid) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + if (err || err == 1) + return err; + } + + if (xnid) { + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); + if (!page) + return -ENOMEM; + + get_node_info(sbi, xnid, &ni); + + phys = (__u64)blk_to_logical(inode, ni.blk_addr); + len = inode->i_sb->s_blocksize; + + f2fs_put_page(page, 1); + + flags = FIEMAP_EXTENT_LAST; + } + + if (phys) + err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); + + return (err < 0 ? 
err : 0); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1150,12 +1212,17 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; int ret = 0; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return ret; inode_lock(inode); + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + ret = f2fs_xattr_fiemap(inode, fieinfo); + goto out; + } + if (f2fs_has_inline_data(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) From 292c8e1cfd4d2eafdd7e90ea269f146dc275e412 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 11 Jan 2018 11:26:19 +0900 Subject: [PATCH 573/804] f2fs: prevent newly created inode from being dirtied incorrectly Now, we invoke f2fs_mark_inode_dirty_sync() to make an inode dirty in advance of creating a new node page for the inode. By this, some inodes whose node page is not created yet can be linked into the global dirty list. If the checkpoint is executed at this moment, the inode will be written back by writeback_single_inode() and finally update_inode_page() will fail to detach the inode from the global dirty list because the inode doesn't have a node page. The problem is that the inode's state in VFS layer will become clean after execution of writeback_single_inode() and it's still linked in the global dirty list of f2fs and this will cause a kernel panic. So, we will prevent the newly created inode from being dirtied during the FI_NEW_INODE flag of the inode is set. We will make it dirty right after the flag is cleared. 
Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Tested-by: Hobin Woo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/inode.c | 3 +++ fs/f2fs/namei.c | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c35f87423123..c26e4a3a04b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2214,6 +2214,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_INLINE_XATTR: case FI_INLINE_DATA: case FI_INLINE_DENTRY: + case FI_NEW_INODE: if (set) return; case FI_DATA_EXIST: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 234322889e65..1dc77a40d0ad 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -22,6 +22,9 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) + return; + if (f2fs_inode_dirtied(inode, sync)) return; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a72c226c4d30..7573779a8e7c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -74,12 +74,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) goto fail_drop; + set_inode_flag(inode, FI_NEW_INODE); + /* If the directory encrypted, then we should encrypt the inode. */ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - set_inode_flag(inode, FI_NEW_INODE); - if (f2fs_sb_has_extra_attr(sbi->sb)) { set_inode_flag(inode, FI_EXTRA_ATTR); F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; From 84960fca96c4330d79d384fe21357f26537357de Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Dec 2017 16:25:39 -0800 Subject: [PATCH 574/804] f2fs: add an ioctl to disable GC for specific file This patch gives a flag to disable GC on given file, which would be useful, when user wants to keep its block map. It also conducts in-place-update for dontmove file. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 + fs/f2fs/f2fs.h | 29 +++++++++++++- fs/f2fs/file.c | 83 +++++++++++++++++++++++++++++++++++++++++ fs/f2fs/gc.c | 11 ++++++ fs/f2fs/gc.h | 2 + fs/f2fs/sysfs.c | 2 + include/linux/f2fs_fs.h | 9 ++++- 7 files changed, 136 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2be6e1999ab3..8c539fe293c8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1478,6 +1478,8 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; + if (f2fs_is_pinned_file(inode)) + return true; if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; if (is_cold_data(fio->page)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c26e4a3a04b2..8a3096f3b3d3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -423,6 +423,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) #define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) +#define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) +#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -657,7 +659,10 @@ struct f2fs_inode_info { unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ - unsigned int i_current_depth; /* use only in directory structure */ + union { + unsigned int i_current_depth; /* only for directory depth */ + unsigned short i_gc_failures; /* only for regular file */ + }; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ @@ -1206,6 +1211,9 @@ struct f2fs_sb_info { /* threshold for converting bg 
victims for fg */ u64 fggc_threshold; + /* threshold for gc trials on pinned files */ + u64 gc_pin_file_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -2205,6 +2213,7 @@ enum { FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2219,6 +2228,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: + case FI_PIN_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2299,6 +2309,13 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) f2fs_mark_inode_dirty_sync(inode, true); } +static inline void f2fs_i_gc_failures_write(struct inode *inode, + unsigned int count) +{ + F2FS_I(inode)->i_gc_failures = count; + f2fs_mark_inode_dirty_sync(inode, true); +} + static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; @@ -2327,6 +2344,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_INLINE_DOTS, &fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) set_bit(FI_EXTRA_ATTR, &fi->flags); + if (ri->i_inline & F2FS_PIN_FILE) + set_bit(FI_PIN_FILE, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2345,6 +2364,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_INLINE_DOTS; if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) ri->i_inline |= F2FS_EXTRA_ATTR; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + ri->i_inline |= F2FS_PIN_FILE; } static inline int f2fs_has_extra_attr(struct inode *inode) @@ -2390,6 +2411,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DOTS); } 
+static inline bool f2fs_is_pinned_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_PIN_FILE); +} + static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(inode, FI_ATOMIC_FILE); @@ -2634,6 +2660,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void truncate_data_blocks_range(struct dnode_of_data *dn, int count); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_pin_file_control(struct inode *inode, bool inc); /* * inode.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 56f6b21cd9a9..617ff6f6f268 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2454,6 +2454,83 @@ static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) return put_user(sb_feature, (u32 __user *)arg); } +int f2fs_pin_file_control(struct inode *inode, bool inc) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* Use i_gc_failures for normal file as a risk signal. 
*/ + if (inc) + f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + + if (fi->i_gc_failures > sbi->gc_pin_file_threshold) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: Enable GC = ino %lx after %x GC trials\n", + __func__, inode->i_ino, fi->i_gc_failures); + clear_inode_flag(inode, FI_PIN_FILE); + return -EAGAIN; + } + return 0; +} + +static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin; + int ret = 0; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(pin, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (f2fs_readonly(F2FS_I_SB(inode)->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + + if (!pin) { + clear_inode_flag(inode, FI_PIN_FILE); + F2FS_I(inode)->i_gc_failures = 1; + goto done; + } + + if (f2fs_pin_file_control(inode, false)) { + ret = -EAGAIN; + goto out; + } + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_PIN_FILE); + ret = F2FS_I(inode)->i_gc_failures; +done: + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 pin = 0; + + if (is_inode_flag_set(inode, FI_PIN_FILE)) + pin = F2FS_I(inode)->i_gc_failures; + return put_user(pin, (u32 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -2500,6 +2577,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_flush_device(filp, arg); case F2FS_IOC_GET_FEATURES: return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_GET_PIN_FILE: + return f2fs_ioc_get_pin_file(filp, arg); + case F2FS_IOC_SET_PIN_FILE: + return 
f2fs_ioc_set_pin_file(filp, arg); default: return -ENOTTY; } @@ -2578,6 +2659,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_GET_PIN_FILE: + case F2FS_IOC_SET_PIN_FILE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index be9fd616736b..d98b869456ce 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -624,6 +624,11 @@ static void move_data_block(struct inode *inode, block_t bidx, if (f2fs_is_atomic_file(inode)) goto out; + if (f2fs_is_pinned_file(inode)) { + f2fs_pin_file_control(inode, true); + goto out; + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -720,6 +725,11 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (f2fs_is_atomic_file(inode)) goto out; + if (f2fs_is_pinned_file(inode)) { + if (gc_type == FG_GC) + f2fs_pin_file_control(inode, true); + goto out; + } if (gc_type == BG_GC) { if (PageWriteback(page)) @@ -1091,6 +1101,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) sbi->fggc_threshold = div64_u64((main_count - ovp_count) * BLKS_PER_SEC(sbi), (main_count - resv_count)); + sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ if (sbi->s_ndevs && sbi->segs_per_sec == 1) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 9325191fab2d..b0045d4c8d1e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -20,6 +20,8 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ +#define DEF_GC_FAILED_PINNED_FILES 2048 + /* Search max. 
number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index ab6028c332aa..41887e6ec1b3 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -301,6 +301,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -349,6 +350,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), + ATTR_LIST(gc_pin_file_thresh), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index fef1caeddf54..9bba23187c04 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -212,6 +212,7 @@ struct f2fs_extent { #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ #define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ +#define F2FS_PIN_FILE 0x40 /* file should not be gced */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -229,7 +230,13 @@ struct f2fs_inode { __le32 i_ctime_nsec; /* change time in nano scale */ __le32 i_mtime_nsec; /* modification time in nano scale */ __le32 i_generation; /* file version (for NFS) */ - __le32 i_current_depth; /* only for directory depth */ + union { + __le32 i_current_depth; /* only for directory depth */ + __le16 i_gc_failures; /* + * # of gc failures on pinned file. + * only for regular files. 
+ */ + }; __le32 i_xattr_nid; /* nid to save xattr */ __le32 i_flags; /* file attributes */ __le32 i_pino; /* parent inode number */ From 999f806a7c9e29e74019e6c2566be04d54c956ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 11 Jan 2018 14:42:30 +0800 Subject: [PATCH 575/804] f2fs: support F2FS_IOC_PRECACHE_EXTENTS This patch introduces a new ioctl F2FS_IOC_PRECACHE_EXTENTS to precache extent info like ext4, in order to gain better performance during triggering AIO by eliminating synchronous waiting of mapping info. Referred commit: 7869a4a6c5ca ("ext4: add support for extent pre-caching") In addition, with newly added extent precache abilitiy, this patch add to support FIEMAP_FLAG_CACHE in ->fiemap. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/file.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 85 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c539fe293c8..95a649467272 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -863,6 +863,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_len = 0; map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { @@ -930,6 +931,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, blkcnt_t prealloc; struct extent_info ei = {0,0,0}; block_t blkaddr; + unsigned int start_pgofs; if (!maxblocks) return 0; @@ -945,6 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; + if (map->m_next_extent) + *map->m_next_extent = pgofs + map->m_len; goto out; } @@ -963,10 +967,14 @@ next_dnode: if (map->m_next_pgofs) *map->m_next_pgofs = get_next_page_offset(&dn, pgofs); + if (map->m_next_extent) + *map->m_next_extent = + 
get_next_page_offset(&dn, pgofs); } goto unlock_out; } + start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); @@ -1000,6 +1008,8 @@ next_block: map->m_pblk = 0; goto sync_out; } + if (flag == F2FS_GET_BLOCK_PRECACHE) + goto sync_out; if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { if (map->m_next_pgofs) @@ -1058,6 +1068,16 @@ skip: else if (dn.ofs_in_node < end_offset) goto next_block; + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + } + f2fs_put_dnode(&dn); if (create) { @@ -1067,6 +1087,17 @@ skip: goto next_dnode; sync_out: + if (flag == F2FS_GET_BLOCK_PRECACHE) { + if (map->m_flags & F2FS_MAP_MAPPED) { + unsigned int ofs = start_pgofs - map->m_lblk; + + f2fs_update_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); + } + if (map->m_next_extent) + *map->m_next_extent = pgofs + 1; + } f2fs_put_dnode(&dn); unlock_out: if (create) { @@ -1088,6 +1119,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; + map.m_next_extent = NULL; map.m_seg_type = seg_type; err = f2fs_map_blocks(inode, &map, create, flag); @@ -1212,6 +1244,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; int ret = 0; + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + ret = f2fs_precache_extents(inode); + if (ret) + return ret; + } + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); if (ret) return ret; @@ -1313,6 +1351,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; for (; nr_pages; 
nr_pages--) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8a3096f3b3d3..28e5a52aadb4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -425,6 +425,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_PIN_FILE _IOW(F2FS_IOCTL_MAGIC, 13, __u32) #define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) +#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -617,6 +618,7 @@ struct f2fs_map_blocks { unsigned int m_len; unsigned int m_flags; pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ + pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; }; @@ -627,6 +629,7 @@ enum { F2FS_GET_BLOCK_BMAP, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, + F2FS_GET_BLOCK_PRECACHE, }; /* @@ -2658,6 +2661,7 @@ int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, int f2fs_setattr(struct dentry *dentry, struct iattr *attr); int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_pin_file_control(struct inode *inode, bool inc); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 617ff6f6f268..29c1aed384f6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1407,7 +1407,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_seg_type = NO_CHECK_TYPE }; + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t 
off_end; @@ -2048,7 +2048,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_seg_type = NO_CHECK_TYPE }; + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; @@ -2531,6 +2531,43 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) return put_user(pin, (u32 __user *)arg); } +int f2fs_precache_extents(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_map_blocks map; + pgoff_t m_next_extent; + loff_t end; + int err; + + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return -EOPNOTSUPP; + + map.m_lblk = 0; + map.m_next_pgofs = NULL; + map.m_next_extent = &m_next_extent; + map.m_seg_type = NO_CHECK_TYPE; + end = F2FS_I_SB(inode)->max_file_blocks; + + while (map.m_lblk < end) { + map.m_len = end - map.m_lblk; + + down_write(&fi->dio_rwsem[WRITE]); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); + up_write(&fi->dio_rwsem[WRITE]); + if (err) + return err; + + map.m_lblk = m_next_extent; + } + + return err; +} + +static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) +{ + return f2fs_precache_extents(file_inode(filp)); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -2581,6 +2618,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_pin_file(filp, arg); case F2FS_IOC_SET_PIN_FILE: return f2fs_ioc_set_pin_file(filp, arg); + case F2FS_IOC_PRECACHE_EXTENTS: + return f2fs_ioc_precache_extents(filp, arg); default: return -ENOTTY; } @@ -2661,6 +2700,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_FEATURES: case F2FS_IOC_GET_PIN_FILE: case F2FS_IOC_SET_PIN_FILE: + case 
F2FS_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; From 4dca47531eb037d663a903508f636f8758cbc172 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 10 Jan 2018 18:18:52 +0800 Subject: [PATCH 576/804] f2fs: speed up defragment on sparse file We have supported to get next page offset with valid mapping crossing hole in f2fs_map_blocks, utilizing it to speed up defragment on sparse file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++++- fs/f2fs/file.c | 11 ++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 95a649467272..ec6698bc8021 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1016,8 +1016,12 @@ next_block: *map->m_next_pgofs = pgofs + 1; goto sync_out; } - if (flag != F2FS_GET_BLOCK_FIEMAP) + if (flag != F2FS_GET_BLOCK_FIEMAP) { + /* for defragment case */ + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; goto sync_out; + } } } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 29c1aed384f6..02c20d55cf90 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2047,10 +2047,10 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_defragment *range) { struct inode *inode = file_inode(filp); - struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; + struct f2fs_map_blocks map = { .m_next_extent = NULL, + .m_seg_type = NO_CHECK_TYPE }; struct extent_info ei = {0,0,0}; - pgoff_t pg_start, pg_end; + pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; block_t blk_end = 0; @@ -2084,6 +2084,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, } map.m_lblk = pg_start; + map.m_next_pgofs = &next_pgofs; /* * lookup mapping info in dnode page cache, skip defragmenting if all @@ -2097,7 +2098,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; if (!(map.m_flags & F2FS_MAP_FLAGS)) { - 
map.m_lblk++; + map.m_lblk = next_pgofs; continue; } @@ -2142,7 +2143,7 @@ do_map: goto clear_out; if (!(map.m_flags & F2FS_MAP_FLAGS)) { - map.m_lblk++; + map.m_lblk = next_pgofs; continue; } From 18d267c273a96a600279b576f492439f969ca6a6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 15 Jan 2018 17:16:46 +0800 Subject: [PATCH 577/804] f2fs: fix to drop all inmem pages correctly In commit 57864ae5ce3a ("f2fs: limit # of inmemory pages"), we have limited memory footprint of all inmem pages with 20% of total memory, otherwise, if we exceed the threshold, we will try to drop all inmem pages to avoid excessive memory pressure resulting in performance regression. But in some unrelated error paths, we will also drop all inmem pages, which should be wrong, fix it in this patch. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ec6698bc8021..6401cf431026 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2082,7 +2082,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false; + bool need_balance = false, drop_atomic = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -2091,6 +2091,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (f2fs_is_atomic_file(inode) && !available_free_memory(sbi, INMEM_PAGES)) { err = -ENOMEM; + drop_atomic = true; goto fail; } @@ -2171,7 +2172,7 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); - if (f2fs_is_atomic_file(inode)) + if (drop_atomic) drop_inmem_pages_all(sbi); return err; } From c4027d08430b904f9000a13dabaced9078fd0a11 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 5 Jan 2018 16:02:36 -0800 Subject: [PATCH 578/804] f2fs: allow quota to use reserved 
blocks This patch allows quota to use reserved blocks all the time. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 28e5a52aadb4..6dc15d0ea3c8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1640,10 +1640,15 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi) +static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, + struct inode *inode) { + if (!inode) + return true; if (!test_opt(sbi, RESERVE_ROOT)) return false; + if (IS_NOQUOTA(inode)) + return true; if (capable(CAP_SYS_RESOURCE)) return true; if (uid_eq(sbi->s_resuid, current_fsuid())) @@ -1684,7 +1689,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi)) + if (!__allow_reserved_blocks(sbi, inode)) avail_user_block_count -= sbi->root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { @@ -1885,7 +1890,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, valid_block_count = sbi->total_valid_block_count + sbi->current_reserved_blocks + 1; - if (!__allow_reserved_blocks(sbi)) + if (!__allow_reserved_blocks(sbi, inode)) valid_block_count += sbi->root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { From d5efd57e013bfcc82e2338b799df0877fc8db236 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 17 Jan 2018 12:11:31 +0800 Subject: [PATCH 579/804] f2fs: avoid hungtask when GC encrypted block if io_bits is set When io_bits is set, GCing encrypted block may hit the following hungtask. Since io_bits requires aligned block address, f2fs_submit_page_write may return -EAGAIN if new_blkaddr does not satisfy io_bits alignment.
As a result, the encrypted page will never be writtenback. This patch makes move_data_block aware the EAGAIN error and cancel the writeback. [ 246.751371] INFO: task kworker/u4:4:797 blocked for more than 90 seconds. [ 246.752423] Not tainted 4.15.0-rc4+ #11 [ 246.754176] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 246.755336] kworker/u4:4 D25448 797 2 0x80000000 [ 246.755597] Workqueue: writeback wb_workfn (flush-7:0) [ 246.755616] Call Trace: [ 246.755695] ? __schedule+0x322/0xa90 [ 246.755761] ? blk_init_request_from_bio+0x120/0x120 [ 246.755773] ? pci_mmcfg_check_reserved+0xb0/0xb0 [ 246.755801] ? __radix_tree_create+0x19e/0x200 [ 246.755813] ? delete_node+0x136/0x370 [ 246.755838] schedule+0x43/0xc0 [ 246.755904] io_schedule+0x17/0x40 [ 246.755939] wait_on_page_bit_common+0x17b/0x240 [ 246.755950] ? wake_page_function+0xa0/0xa0 [ 246.755961] ? add_to_page_cache_lru+0x160/0x160 [ 246.755972] ? page_cache_tree_insert+0x170/0x170 [ 246.755983] ? __lru_cache_add+0x96/0xb0 [ 246.756086] __filemap_fdatawait_range+0x14f/0x1c0 [ 246.756097] ? wait_on_page_bit_common+0x240/0x240 [ 246.756120] ? __wake_up_locked_key_bookmark+0x20/0x20 [ 246.756167] ? wait_on_all_pages_writeback+0xc9/0x100 [ 246.756179] ? __remove_ino_entry+0x120/0x120 [ 246.756192] ? wait_woken+0x100/0x100 [ 246.756204] filemap_fdatawait_range+0x9/0x20 [ 246.756216] write_checkpoint+0x18a1/0x1f00 [ 246.756254] ? blk_get_request+0x10/0x10 [ 246.756265] ? cpumask_next_and+0x43/0x60 [ 246.756279] ? f2fs_sync_inode_meta+0x160/0x160 [ 246.756289] ? remove_element.isra.4+0xa0/0xa0 [ 246.756300] ? __put_compound_page+0x40/0x40 [ 246.756310] ? f2fs_sync_fs+0xec/0x1c0 [ 246.756320] ? f2fs_sync_fs+0x120/0x1c0 [ 246.756329] f2fs_sync_fs+0x120/0x1c0 [ 246.756357] ? trace_event_raw_event_f2fs__page+0x260/0x260 [ 246.756393] ? ata_build_rw_tf+0x173/0x410 [ 246.756397] f2fs_balance_fs_bg+0x198/0x390 [ 246.756405] ? drop_inmem_page+0x230/0x230 [ 246.756415] ? 
ahci_qc_prep+0x1bb/0x2e0 [ 246.756418] ? ahci_qc_issue+0x1df/0x290 [ 246.756422] ? __accumulate_pelt_segments+0x42/0xd0 [ 246.756426] ? f2fs_write_node_pages+0xd1/0x380 [ 246.756429] f2fs_write_node_pages+0xd1/0x380 [ 246.756437] ? sync_node_pages+0x8f0/0x8f0 [ 246.756440] ? update_curr+0x53/0x220 [ 246.756444] ? __accumulate_pelt_segments+0xa2/0xd0 [ 246.756448] ? __update_load_avg_se.isra.39+0x349/0x360 [ 246.756452] ? do_writepages+0x2a/0xa0 [ 246.756456] do_writepages+0x2a/0xa0 [ 246.756460] __writeback_single_inode+0x70/0x490 [ 246.756463] ? check_preempt_wakeup+0x199/0x310 [ 246.756467] writeback_sb_inodes+0x2a2/0x660 [ 246.756471] ? is_empty_dir_inode+0x40/0x40 [ 246.756474] ? __writeback_single_inode+0x490/0x490 [ 246.756477] ? string+0xbf/0xf0 [ 246.756480] ? down_read_trylock+0x35/0x60 [ 246.756484] __writeback_inodes_wb+0x9f/0xf0 [ 246.756488] wb_writeback+0x41d/0x4b0 [ 246.756492] ? writeback_inodes_wb.constprop.55+0x150/0x150 [ 246.756498] ? set_worker_desc+0xf7/0x130 [ 246.756502] ? current_is_workqueue_rescuer+0x60/0x60 [ 246.756511] ? _find_next_bit+0x2c/0xa0 [ 246.756514] ? wb_workfn+0x400/0x5d0 [ 246.756518] wb_workfn+0x400/0x5d0 [ 246.756521] ? finish_task_switch+0xdf/0x2a0 [ 246.756525] ? inode_wait_for_writeback+0x30/0x30 [ 246.756529] process_one_work+0x3a7/0x6f0 [ 246.756533] worker_thread+0x82/0x750 [ 246.756537] kthread+0x16f/0x1c0 [ 246.756541] ? trace_event_raw_event_workqueue_work+0x110/0x110 [ 246.756544] ? 
kthread_create_worker_on_cpu+0xb0/0xb0 [ 246.756548] ret_from_fork+0x1f/0x30 Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d98b869456ce..d0de3429c26c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -691,7 +691,12 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_write(&fio); + err = f2fs_submit_page_write(&fio); + if (err) { + if (PageWriteback(fio.encrypted_page)) + end_page_writeback(fio.encrypted_page); + goto put_page_out; + } f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); From c1b74c96709223b65a03732cfc9963483e3d105f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 16:31:35 +0800 Subject: [PATCH 580/804] f2fs: clean up error path of fill_super This patch cleans up error path of fille_super to avoid unneeded release step. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d89834b79646..aaeba346e9d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2612,14 +2612,14 @@ try_onemore: err = init_percpu_info(sbi); if (err) - goto free_options; + goto free_bio_info; if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); if (!sbi->write_io_dummy) { err = -ENOMEM; - goto free_options; + goto free_percpu; } } @@ -2851,10 +2851,12 @@ free_meta_inode: iput(sbi->meta_inode); free_io_dummy: mempool_destroy(sbi->write_io_dummy); -free_options: +free_percpu: + destroy_percpu_info(sbi); +free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); - destroy_percpu_info(sbi); +free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); From eeb0118b8340767cb5be7ccc0abeaba11416b317 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 16:31:36 +0800 Subject: [PATCH 581/804] f2fs: kill F2FS_INLINE_XATTR_ADDRS for cleanup Use get_inline_xattr_addrs directly instead of F2FS_INLINE_XATTR_ADDRS. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6401cf431026..7bd2c9e7e873 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1200,7 +1200,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS(inode)); + get_inline_xattr_addrs(inode)); phys += offset; len = inline_xattr_size(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6dc15d0ea3c8..23944b3417dd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -478,10 +478,9 @@ struct f2fs_flush_device { #define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); -#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ (CUR_ADDRS_PER_INODE(inode) - \ - F2FS_INLINE_XATTR_ADDRS(inode) - \ + get_inline_xattr_addrs(inode) - \ DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ @@ -2388,7 +2387,7 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); + return CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode); } static inline void *inline_xattr_addr(struct inode *inode, struct page *page) @@ -2396,7 +2395,7 @@ static inline void *inline_xattr_addr(struct inode *inode, struct page *page) struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS(inode)]); + get_inline_xattr_addrs(inode)]); } static inline int inline_xattr_size(struct inode *inode) From f31d52811c1f654de5f8a01c5e277b56e737e9c3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 
2018 16:31:37 +0800 Subject: [PATCH 582/804] f2fs: fix to update last_disk_size correctly This patch fixes to update last_disk_size only when writing out page successfully. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd2c9e7e873..5dc6e461aa31 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1715,10 +1715,14 @@ write: } } - down_write(&F2FS_I(inode)->i_sem); - if (F2FS_I(inode)->last_disk_size < psize) - F2FS_I(inode)->last_disk_size = psize; - up_write(&F2FS_I(inode)->i_sem); + if (err) { + file_set_keep_isize(inode); + } else { + down_write(&F2FS_I(inode)->i_sem); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); + } done: if (err && err != -ENOENT) From 700b53f21ee8c4feb0238b10ea23b76a8f1e7231 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 16:31:38 +0800 Subject: [PATCH 583/804] f2fs: split need_inplace_update This patch splits need_inplace_update to two functions: a. should_update_inplace() includes all conditions that we must use IPU. b. should_update_outplace() includes all conditions that we must use OPU. So that, in f2fs_ioc_set_pin_file() and f2fs_defragment_range(), we can use corresponding function to check whether we can trigger OPU/IPU or not. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 75 ++++++++++++++++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/file.c | 7 ++++- fs/f2fs/segment.h | 41 -------------------------- 4 files changed, 75 insertions(+), 50 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5dc6e461aa31..2003ebdc9b52 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1517,20 +1517,79 @@ retry_encrypt: return PTR_ERR(fio->encrypted_page); } +static inline bool check_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; + + if (policy & (0x1 << F2FS_IPU_FORCE)) + return true; + if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(inode, FI_NEED_IPU)) + return true; + + return false; +} + +bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +{ + if (f2fs_is_pinned_file(inode)) + return true; + + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + + return check_inplace_update_policy(inode, fio); +} + +bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (test_opt(sbi, LFS)) + return true; + if (S_ISDIR(inode->i_mode)) + return true; + if (f2fs_is_atomic_file(inode)) + return true; + if (fio) { + if (is_cold_data(fio->page)) + 
return true; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return true; + } + return false; +} + static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (f2fs_is_pinned_file(inode)) - return true; - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (is_cold_data(fio->page)) - return false; - if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + if (should_update_outplace(inode, fio)) return false; - return need_inplace_update_policy(inode, fio); + return should_update_inplace(inode, fio); } static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 23944b3417dd..29dad838bf42 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2928,6 +2928,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); void f2fs_set_page_dirty_nobuffers(struct page *page); int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 02c20d55cf90..2eb9710bf263 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2058,7 +2058,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update_policy(inode, NULL)) + if (should_update_inplace(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2498,6 +2498,11 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); + if (should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); F2FS_I(inode)->i_gc_failures = 1; diff --git 
a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5c4d432ebf1d..e123dd30f2e4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -580,47 +580,6 @@ enum { F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update_policy(struct inode *inode, - struct f2fs_io_info *fio) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int policy = SM_I(sbi)->ipu_policy; - - if (test_opt(sbi, LFS)) - return false; - - /* if this is cold file, we should overwrite to avoid fragmentation */ - if (file_is_cold(inode)) - return true; - - if (policy & (0x1 << F2FS_IPU_FORCE)) - return true; - if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) - return true; - if (policy & (0x1 << F2FS_IPU_UTIL) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && - utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - - /* - * IPU for rewrite async pages - */ - if (policy & (0x1 << F2FS_IPU_ASYNC) && - fio && fio->op == REQ_OP_WRITE && - !(fio->op_flags & REQ_SYNC) && - !f2fs_encrypted_inode(inode)) - return true; - - /* this is only set during fdatasync */ - if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(inode, FI_NEED_IPU)) - return true; - - return false; -} - static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { From bb924f777717669e420038c1edd0962ac9205111 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Jan 2018 22:28:52 +0800 Subject: [PATCH 584/804] f2fs: hanlde error case in f2fs_ioc_shutdown This patch makes f2fs_ioc_shutdown handling error case correctly. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2eb9710bf263..b108395a6e38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1815,14 +1815,20 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); - if (sb && !IS_ERR(sb)) { + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto out; + } + if (sb) { f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ - f2fs_sync_fs(sb, 1); + ret = f2fs_sync_fs(sb, 1); + if (ret) + goto out; f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: From 8069a0e983d999641331e3a7c8cda42de0ae1166 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jan 2018 17:23:29 +0800 Subject: [PATCH 585/804] f2fs: stop gc/discard thread after fs shutdown Once the filesystem shuts down, daemons like the gc/discard threads should be aware of it and exit; in addition, drop all cached pending discard commands and turn off real-time discard mode.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 7 +++++++ fs/f2fs/segment.c | 5 +++++ 3 files changed, 13 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 29dad838bf42..26f8aefe5f5f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2820,6 +2820,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void init_discard_policy(struct discard_policy *dpolicy, int discard_type, unsigned int granularity); +void drop_discard_cmd(struct f2fs_sb_info *sbi); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b108395a6e38..86507b51b7d3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1842,6 +1842,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = -EINVAL; goto out; } + + stop_gc_thread(sbi); + stop_discard_thread(sbi); + + drop_discard_cmd(sbi); + clear_opt(sbi, DISCARD); + f2fs_update_time(sbi, REQ_TIME); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7638ebb1c343..cfc19d8d4625 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1343,6 +1343,11 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } +void drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + __drop_discard_cmd(sbi); +} + static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { From 70b3a923daff38468c03ad2b5a4b6efd65e5afa3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 18 Jan 2018 17:29:10 +0800 Subject: [PATCH 586/804] f2fs: drop page cache after fs shutdown Don't remain dirtied page cache in f2fs after shutdown, it can mitigate memory pressure of whole system, in order to keep other modules working properly. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +++++-- fs/f2fs/data.c | 12 ++++++------ fs/f2fs/node.c | 19 ++++++++++--------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8e629434cd05..91c18dd62974 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -238,12 +238,15 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); + if (unlikely(f2fs_cp_error(sbi))) { + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; + } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2003ebdc9b52..c80f138b0f33 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1715,6 +1715,12 @@ static int __write_data_page(struct page *page, bool *submitted, trace_f2fs_writepage(page, DATA); + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; @@ -1739,12 +1745,6 @@ write: available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - mapping_set_error(page->mapping, -EIO); - goto out; - } - /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { fio.need_lock = LOCK_DONE; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 676b0e3f5ef3..ad5df8bc51ad 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1344,10 +1344,14 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); + if 
(unlikely(f2fs_cp_error(sbi))) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; + } + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; /* get old block addr of this node page */ nid = nid_of_node(page); @@ -1592,12 +1596,6 @@ next_step: struct page *page = pvec.pages[i]; bool submitted = false; - if (unlikely(f2fs_cp_error(sbi))) { - pagevec_release(&pvec); - ret = -EIO; - goto out; - } - /* * flushing sequence with step: * 0. indirect nodes @@ -1667,9 +1665,12 @@ continue_unlock: step++; goto next_step; } -out: + if (nwritten) f2fs_submit_merged_write(sbi, NODE); + + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; return ret; } From 64aa9569a1bffeafac71f48930e05f87a909d1cb Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Sat, 20 Jan 2018 15:46:33 +0800 Subject: [PATCH 587/804] f2fs: correct removexattr behavior for null valued extended attribute __vfs_removexattr() transfers "NULL" value to the setxattr handler of the f2fs filesystem in order to remove the extended attribute. But, __f2fs_setxattr() just ignores the removal request when the value of the extended attribute is already NULL. We have to remove the extended attribute itself even if the value of that is already NULL. We can reproduce this bug with the steps below: 1. touch file 2. setfattr -n "user.foo" file 3. setfattr -x "user.foo" file 4.
getfattr -d file > user.foo Signed-off-by: Daeho Jeong Signed-off-by: Youngjin Gil Tested-by: Hobin Woo Tested-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 353fbff85bab..116be979b897 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -640,7 +640,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, goto exit; } - if (f2fs_xattr_value_same(here, value, size)) + if (value && f2fs_xattr_value_same(here, value, size)) goto exit; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; From e56500860be0787a5b78380463ec0fd027460de3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jan 2018 20:01:40 -0800 Subject: [PATCH 588/804] f2fs: recover some i_inline flags This fixes lost i_inline flags during roll-forward. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7d63faf51e52..b6d1ec620a8c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -195,6 +195,20 @@ out: return err; } +static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_PIN_FILE) + set_inode_flag(inode, FI_PIN_FILE); + else + clear_inode_flag(inode, FI_PIN_FILE); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(inode, FI_DATA_EXIST); + else + clear_inode_flag(inode, FI_DATA_EXIST); + if (!(ri->i_inline & F2FS_INLINE_DOTS)) + clear_inode_flag(inode, FI_INLINE_DOTS); +} + static void recover_inode(struct inode *inode, struct page *page) { struct f2fs_inode *raw = F2FS_INODE(page); @@ -211,13 +225,16 @@ static void recover_inode(struct inode *inode, struct page *page) F2FS_I(inode)->i_advise = raw->i_advise; + recover_inline_flags(inode, raw); + if (file_enc_name(inode)) name = ""; else name = 
F2FS_INODE(page)->i_name; - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(page), name); + f2fs_msg(inode->i_sb, KERN_NOTICE, + "recover_inode: ino = %x, name = %s, inline = %x", + ino_of_node(page), name, raw->i_inline); } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, From 35b11839a1ae84d02338b2c96952bffa1c908df8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 19 Jan 2018 13:42:33 -0800 Subject: [PATCH 589/804] f2fs: allow to recover node blocks given updated checkpoint If fsck.f2fs changes crc, we have no way to recover some inode blocks by roll- forward recovery. Let's relax the condition to recover them. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 + fs/f2fs/node.h | 4 ++++ include/linux/f2fs_fs.h | 1 + 3 files changed, 6 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 91c18dd62974..3c343e922f6e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1161,6 +1161,7 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); spin_unlock_irqrestore(&sbi->cp_lock, flags); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0ee3e5ff49a3..081ef0d672bf 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -305,6 +305,10 @@ static inline bool is_recoverable_dnode(struct page *page) struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); __u64 cp_ver = cur_cp_version(ckpt); + /* Don't care crc part, if fsck.f2fs sets it. 
*/ + if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG)) + return (cp_ver << 32) == (cpver_of_node(page) << 32); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 9bba23187c04..9e0cb7b63883 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -117,6 +117,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_NOCRC_RECOVERY_FLAG 0x00000200 #define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 From b007190234d624dad977a124ec8d520f4c874b6d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 18:57:25 +0800 Subject: [PATCH 590/804] f2fs: use GFP_F2FS_ZERO for cleanup Clean up codes with GFP_F2FS_ZERO, no logic changes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ad5df8bc51ad..c294d0feea08 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -143,11 +143,9 @@ static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) struct nat_entry *new; if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, - GFP_NOFS | __GFP_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); else - new = kmem_cache_alloc(nat_entry_slab, - GFP_NOFS | __GFP_ZERO); + new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); From fa043fae90300e9b49218e204409a5066121b0a7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 18:57:26 +0800 Subject: [PATCH 591/804] f2fs: clean up duplicated assignment in init_discard_policy Remove duplicated codes of assignment for .max_requests and .io_aware_gran. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cfc19d8d4625..31c69c6660e7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1795,25 +1795,20 @@ void init_discard_policy(struct discard_policy *dpolicy, dpolicy->sync = true; dpolicy->granularity = granularity; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = true; } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = true; } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; dpolicy->io_aware = false; } } From 1062a0c018296c6719e49b25bed206d414c18898 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 18:57:27 +0800 Subject: [PATCH 592/804] f2fs: stop issuing discard if fs is readonly If filesystem is readonly, stop to issue discard in daemon. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 31c69c6660e7..6662c6caf477 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1486,6 +1486,8 @@ static int issue_discard_thread(void *data) msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; + if (f2fs_readonly(sbi->sb)) + continue; if (kthread_should_stop()) return 0; From 9fb0de175172c63132cc84b630e8c50834269e1b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 25 Jan 2018 17:27:11 +0800 Subject: [PATCH 593/804] f2fs: rebuild sit page from sit info in mem This patch rebuild sit page from sit info in mem instead of issue a read io. I test this method and the result is as below: Pre: mmc_perf_test-12061 [001] ...1 976.819992: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [001] ...1 976.856446: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 998.976946: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 999.023269: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1022.060772: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1022.111034: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [002] ...1 1070.127643: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1070.187352: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1095.942124: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1095.995975: f2fs_write_checkpoint: dev = (259,44), 
checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1122.535091: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [003] ...1 1122.586521: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [001] ...1 1147.897487: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [001] ...1 1147.959438: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [003] ...1 1177.926951: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [002] ...1 1177.976823: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-12061 [002] ...1 1204.176087: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-12061 [002] ...1 1204.239046: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit Some sit flush consume more than 50ms. 
Now: mmc_perf_test-2187 [007] ...1 196.840684: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [007] ...1 196.841258: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [007] ...1 219.430582: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [007] ...1 219.431144: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [002] ...1 243.638678: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [000] ...1 243.638980: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [002] ...1 265.392180: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [002] ...1 265.392245: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [000] ...1 290.309051: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [000] ...1 290.309116: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [003] ...1 317.144209: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [003] ...1 317.145913: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [005] ...1 343.224954: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [005] ...1 343.225574: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [000] ...1 370.239846: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [000] ...1 370.241138: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end 
flush sit mmc_perf_test-2187 [001] ...1 397.029043: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [001] ...1 397.030750: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit mmc_perf_test-2187 [003] ...1 425.386377: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = start flush sit mmc_perf_test-2187 [003] ...1 425.387735: f2fs_write_checkpoint: dev = (259,44), checkpoint for Sync, state = end flush sit Most sit flush consume no more than 1ms. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 19 +++++-------------- fs/f2fs/segment.h | 29 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6662c6caf477..bf98f6f34b7e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3191,28 +3191,19 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - struct page *src_page, *dst_page; + struct page *page; pgoff_t src_off, dst_off; - void *src_addr, *dst_addr; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - /* get current sit block page without lock */ - src_page = get_meta_page(sbi, src_off); - dst_page = grab_meta_page(sbi, dst_off); - f2fs_bug_on(sbi, PageDirty(src_page)); - - src_addr = page_address(src_page); - dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_SIZE); - - set_page_dirty(dst_page); - f2fs_put_page(src_page, 1); + page = grab_meta_page(sbi, dst_off); + seg_info_to_sit_page(sbi, page, start); + set_page_dirty(page); set_to_next_sit(sit_i, start); - return dst_page; + return page; } static struct sit_entry_set *grab_sit_entry_set(void) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e123dd30f2e4..5d6d3e72be31 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -348,16 +348,41 @@ static 
inline void seg_info_from_raw_sit(struct seg_entry *se, se->mtime = le64_to_cpu(rs->mtime); } -static inline void seg_info_to_raw_sit(struct seg_entry *se, +static inline void __seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks; rs->vblocks = cpu_to_le16(raw_vblocks); memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + rs->mtime = cpu_to_le64(se->mtime); +} + +static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, + struct page *page, unsigned int start) +{ + struct f2fs_sit_block *raw_sit; + struct seg_entry *se; + struct f2fs_sit_entry *rs; + unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + int i; + + raw_sit = (struct f2fs_sit_block *)page_address(page); + for (i = 0; i < end - start; i++) { + rs = &raw_sit->entries[i]; + se = get_seg_entry(sbi, start + i); + __seg_info_to_raw_sit(se, rs); + } +} + +static inline void seg_info_to_raw_sit(struct seg_entry *se, + struct f2fs_sit_entry *rs) +{ + __seg_info_to_raw_sit(se, rs); + memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); se->ckpt_valid_blocks = se->valid_blocks; - rs->mtime = cpu_to_le64(se->mtime); } static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, From 5f9b3abb911fa2f51f5690f4376cf919c2069662 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 14:54:42 +0800 Subject: [PATCH 594/804] f2fs: support inode creation time This patch adds creation time field in inode layout to support showing kstat.btime in ->statx. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 +++++++ fs/f2fs/file.c | 31 +++++++++++++++++++++++++++++++ fs/f2fs/inode.c | 15 +++++++++++++++ fs/f2fs/namei.c | 3 ++- fs/f2fs/sysfs.c | 7 +++++++ include/linux/f2fs_fs.h | 4 +++- 6 files changed, 65 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 26f8aefe5f5f..0eeeeba0246d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -125,6 +125,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_FEATURE_QUOTA_INO 0x0080 +#define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -699,6 +700,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ + struct timespec i_crtime; /* inode creation time */ }; static inline void get_extent_info(struct extent_info *ext, @@ -3299,6 +3301,11 @@ static inline int f2fs_sb_has_quota_ino(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); } +static inline int f2fs_sb_has_inode_crtime(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 86507b51b7d3..65cda5bc61b7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -673,6 +673,37 @@ int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = d_inode(dentry); +#if 0 + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri; + unsigned int flags; + + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_inode_crtime(inode->i_sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + 
stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = fi->i_crtime.tv_sec; + stat->btime.tv_nsec = fi->i_crtime.tv_nsec; + } + + flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); + if (flags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (flags & FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; + if (f2fs_encrypted_inode(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + if (flags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (flags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); +#endif generic_fillattr(inode, stat); /* we need to show initial sectors used for inline_data/dentries */ diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1dc77a40d0ad..89c838bfb067 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -278,6 +278,12 @@ static int do_read_inode(struct inode *inode) i_projid = F2FS_DEF_PROJID; fi->i_projid = make_kprojid(&init_user_ns, i_projid); + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { + fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); + fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); + } + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -421,6 +427,15 @@ void update_inode(struct inode *inode, struct page *node_page) F2FS_I(inode)->i_projid); ri->i_projid = cpu_to_le32(i_projid); } + + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_crtime)) { + ri->i_crtime = + cpu_to_le64(F2FS_I(inode)->i_crtime.tv_sec); + ri->i_crtime_nsec = + cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); + } } __set_inode_rdev(inode, ri); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7573779a8e7c..da7f709e3926 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -50,7 +50,8 @@ 
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + F2FS_I(inode)->i_crtime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 41887e6ec1b3..d978c7b6ea04 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -113,6 +113,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_quota_ino(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); + if (f2fs_sb_has_inode_crtime(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_crtime"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -232,6 +235,7 @@ enum feat_id { FEAT_INODE_CHECKSUM, FEAT_FLEXIBLE_INLINE_XATTR, FEAT_QUOTA_INO, + FEAT_INODE_CRTIME, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -246,6 +250,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_INODE_CHECKSUM: case FEAT_FLEXIBLE_INLINE_XATTR: case FEAT_QUOTA_INO: + case FEAT_INODE_CRTIME: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -323,6 +328,7 @@ F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); +F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -376,6 +382,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(inode_checksum), ATTR_LIST(flexible_inline_xattr), ATTR_LIST(quota_ino), + ATTR_LIST(inode_crtime), NULL, }; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 9e0cb7b63883..c82ae65b5330 100644 --- 
a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -253,8 +253,10 @@ struct f2fs_inode { __le16 i_inline_xattr_size; /* inline xattr size, unit: 4 bytes */ __le32 i_projid; /* project id */ __le32 i_inode_checksum;/* inode meta checksum */ + __le64 i_crtime; /* creation time */ + __le32 i_crtime_nsec; /* creation time in nano scale */ __le32 i_extra_end[0]; /* for attribute size calculation */ - }; + } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), From 39ed8376d611d2a211079be6a2ac08715f5a58c4 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Wed, 28 Feb 2018 20:31:52 +0800 Subject: [PATCH 595/804] f2fs: don't put dentry page in pagecache into highmem Previous dentry page uses highmem, which will cause panic in platforms using highmem (such as arm), since the address space of dentry pages from highmem directly goes into the decryption path via the function fscrypt_fname_disk_to_usr. But sg_init_one assumes the address is not from highmem, and then cause panic since it doesn't call kmap_high but kunmap_high is triggered at the end. To fix this problem in a simple way, this patch avoids to put dentry page in pagecache into highmem. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu [Jaegeuk Kim: fix coding style] Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 23 +++++------------------ fs/f2fs/f2fs.h | 6 ------ fs/f2fs/inline.c | 3 +-- fs/f2fs/inode.c | 2 +- fs/f2fs/namei.c | 32 ++++++++------------------------ fs/f2fs/recovery.c | 11 +++++------ include/linux/f2fs_fs.h | 1 - 7 files changed, 20 insertions(+), 58 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bde445e4e690..560b707050ca 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,14 +94,12 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; - dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; - else - kunmap(dentry_page); return de; } @@ -287,7 +285,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, *page); f2fs_put_page(*page, 0); } @@ -302,7 +299,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); - f2fs_dentry_kunmap(dir, page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); @@ -350,13 +346,11 @@ static int make_empty_dir(struct inode *inode, if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); - kunmap_atomic(dentry_blk); - set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; @@ -547,13 +541,12 @@ start: if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - dentry_blk = 
kmap(dentry_page); + dentry_blk = page_address(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } @@ -588,7 +581,6 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); return err; @@ -642,7 +634,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, F2FS_I(dir)->task = NULL; } if (de) { - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); err = -EEXIST; } else if (IS_ERR(page)) { @@ -730,7 +721,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); - kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); @@ -775,7 +765,7 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - dentry_blk = kmap_atomic(dentry_page); + dentry_blk = page_address(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -783,7 +773,6 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); @@ -901,19 +890,17 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } } - dentry_blk = kmap(dentry_page); + dentry_blk = page_address(dentry_page); make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; } - kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } out_free: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0eeeeba0246d..e54ffadb692c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2463,12 +2463,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DENTRY); } -static 
inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) -{ - if (!f2fs_has_inline_dentry(dir)) - kunmap(page); -} - static inline int is_file(struct inode *inode, int type) { return F2FS_I(inode)->i_advise & type; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 0fa5ca0907ba..12f6c6471c56 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -369,7 +369,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, f2fs_wait_on_page_writeback(page, DATA, true); zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); - dentry_blk = kmap_atomic(page); + dentry_blk = page_address(page); make_dentry_ptr_inline(dir, &src, inline_dentry); make_dentry_ptr_block(dir, &dst, dentry_blk); @@ -386,7 +386,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); - kunmap_atomic(dentry_blk); if (!PageUptodate(page)) SetPageUptodate(page); set_page_dirty(page); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 89c838bfb067..10be247ca421 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -328,7 +328,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); } else if (S_ISLNK(inode->i_mode)) { if (f2fs_encrypted_inode(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index da7f709e3926..6bb1adb84324 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -317,7 +317,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) de = f2fs_find_entry(dir, &dot, &page); if (de) { - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); } else if (IS_ERR(page)) { err = PTR_ERR(page); @@ -329,14 +328,12 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) 
} de = f2fs_find_entry(dir, &dotdot, &page); - if (de) { - f2fs_dentry_kunmap(dir, page); + if (de) f2fs_put_page(page, 0); - } else if (IS_ERR(page)) { + else if (IS_ERR(page)) err = PTR_ERR(page); - } else { + else err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); - } out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -377,7 +374,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } ino = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); @@ -452,7 +448,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) err = acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); - f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); goto fail; } @@ -610,7 +605,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + inode_nohighmem(inode); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); @@ -924,13 +919,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (old_dir_entry) { - if (old_dir != new_dir && !whiteout) { + if (old_dir != new_dir && !whiteout) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - } else { - f2fs_dentry_kunmap(old_inode, old_dir_page); + else f2fs_put_page(old_dir_page, 0); - } f2fs_i_links_write(old_dir, false); } add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); @@ -943,20 +936,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - if (new_page) { - f2fs_dentry_kunmap(new_dir, new_page); + if (new_page) f2fs_put_page(new_page, 0); - } out_whiteout: if (whiteout) iput(whiteout); out_dir: - if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); + if (old_dir_entry) f2fs_put_page(old_dir_page, 0); 
- } out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; @@ -1098,19 +1086,15 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; out_new_dir: if (new_dir_entry) { - f2fs_dentry_kunmap(new_inode, new_dir_page); f2fs_put_page(new_dir_page, 0); } out_old_dir: if (old_dir_entry) { - f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } out_new: - f2fs_dentry_kunmap(new_dir, new_page); f2fs_put_page(new_page, 0); out_old: - f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b6d1ec620a8c..210de28c9cd2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -144,7 +144,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, retry: de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) - goto out_unmap_put; + goto out_put; if (de) { einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); @@ -153,19 +153,19 @@ retry: err = PTR_ERR(einode); if (err == -ENOENT) err = -EEXIST; - goto out_unmap_put; + goto out_put; } err = dquot_initialize(einode); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); - goto out_unmap_put; + goto out_put; } f2fs_delete_entry(de, page, dir, einode); iput(einode); @@ -180,8 +180,7 @@ retry: goto retry; goto out; -out_unmap_put: - f2fs_dentry_kunmap(dir, page); +out_put: f2fs_put_page(page, 0); out: if (file_enc_name(inode)) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c82ae65b5330..073365c9808a 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -46,7 +46,6 @@ /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) -#define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM) /* * For further optimization on 
multi-head logs, on-disk layout supports maximum From 3a2c7917782292956a32d4e1df8dc3dbecc01b25 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:52 -0800 Subject: [PATCH 596/804] fscrypt: move fscrypt_has_encryption_key() to supp/notsupp headers fscrypt_has_encryption_key() is already split into two versions depending on whether the filesystem is being built with encryption support or not. Move them into the appropriate headers. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 10 ---------- include/linux/fscrypt_notsupp.h | 5 +++++ include/linux/fscrypt_supp.h | 5 +++++ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 8641e56b8f8a..1e2343b46262 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -125,11 +125,6 @@ static inline struct page *fscrypt_control_page(struct page *page) return ((struct fscrypt_ctx *)page_private(page))->w.control_page; } -static inline bool fscrypt_has_encryption_key(const struct inode *inode) -{ - return (inode->i_crypt_info != NULL); -} - #include #else /* !__FS_HAS_ENCRYPTION */ @@ -140,11 +135,6 @@ static inline struct page *fscrypt_control_page(struct page *page) return ERR_PTR(-EINVAL); } -static inline bool fscrypt_has_encryption_key(const struct inode *inode) -{ - return 0; -} - #include #endif /* __FS_HAS_ENCRYPTION */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index c4c6bf2c390e..f8685c25b7b3 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -13,6 +13,11 @@ #ifndef _LINUX_FSCRYPT_NOTSUPP_H #define _LINUX_FSCRYPT_NOTSUPP_H +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return false; +} + /* crypto.c */ static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 
2db5e9706f60..1fb73a6892b1 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,11 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +static inline bool fscrypt_has_encryption_key(const struct inode *inode) +{ + return (inode->i_crypt_info != NULL); +} + /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); From dfe0b3b1b67f6489ea857ef75135e27eb16638d4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:53 -0800 Subject: [PATCH 597/804] fscrypt: move fscrypt_control_page() to supp/notsupp headers fscrypt_control_page() is already split into two versions depending on whether the filesystem is being built with encryption support or not. Move them into the appropriate headers. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 18 ++---------------- include/linux/fscrypt_notsupp.h | 5 +++++ include/linux/fscrypt_supp.h | 6 ++++++ 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 1e2343b46262..0961315a5fff 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -119,24 +119,10 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) } #if __FS_HAS_ENCRYPTION - -static inline struct page *fscrypt_control_page(struct page *page) -{ - return ((struct fscrypt_ctx *)page_private(page))->w.control_page; -} - #include - -#else /* !__FS_HAS_ENCRYPTION */ - -static inline struct page *fscrypt_control_page(struct page *page) -{ - WARN_ON_ONCE(1); - return ERR_PTR(-EINVAL); -} - +#else #include -#endif /* __FS_HAS_ENCRYPTION */ +#endif /** * fscrypt_require_key - require an inode's encryption key diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index f8685c25b7b3..3d394a0737ed 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -47,6 +47,11 
@@ static inline int fscrypt_decrypt_page(const struct inode *inode, return -EOPNOTSUPP; } +static inline struct page *fscrypt_control_page(struct page *page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} static inline void fscrypt_restore_control_page(struct page *page) { diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 1fb73a6892b1..95ea7265e25b 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -24,6 +24,12 @@ extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, u64, gfp_t); extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int, unsigned int, u64); + +static inline struct page *fscrypt_control_page(struct page *page) +{ + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +} + extern void fscrypt_restore_control_page(struct page *); extern const struct dentry_operations fscrypt_d_ops; From 8216a0b51a3b24006d8318e28a7ef318bf142506 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:54 -0800 Subject: [PATCH 598/804] fscrypt: move fscrypt_info_cachep declaration to fscrypt_private.h The fscrypt_info kmem_cache is internal to fscrypt; filesystems don't need to access it. So move its declaration into fscrypt_private.h. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 1 + include/linux/fscrypt_supp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index c3ad415cd14f..09e99f5007de 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -82,6 +82,7 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, } /* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); extern struct workqueue_struct *fscrypt_read_workqueue; extern int fscrypt_do_page_crypto(const struct inode *inode, diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 95ea7265e25b..11522c880632 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -16,7 +16,6 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) } /* crypto.c */ -extern struct kmem_cache *fscrypt_info_cachep; extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, From 3f16e09dadfbf319fe4a71b61f6049a83d7c277c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:55 -0800 Subject: [PATCH 599/804] fscrypt: move fscrypt_ctx declaration to fscrypt_supp.h Filesystems only ever access 'struct fscrypt_ctx' through fscrypt functions. But when a filesystem is built without encryption support, these functions are all stubbed out, so the declaration of fscrypt_ctx is unneeded. Therefore, move it from fscrypt.h to fscrypt_supp.h. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 16 +--------------- include/linux/fscrypt_supp.h | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 0961315a5fff..f627ee378bc3 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -23,23 +23,9 @@ #define FS_CRYPTO_BLOCK_SIZE 16 +struct fscrypt_ctx; struct fscrypt_info; -struct fscrypt_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - u8 flags; /* Flags */ -}; - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 11522c880632..40fee89fac9e 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,21 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return (inode->i_crypt_info != NULL); From 7ed178bc8ae9eb13c0a0a155688e2a7187afd2bb Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:56 -0800 Subject: [PATCH 600/804] fscrypt: split fscrypt_dummy_context_enabled() into supp/notsupp versions fscrypt_dummy_context_enabled() accesses ->s_cop, which now is only set when the filesystem is built with encryption support. This didn't actually matter because no filesystems called it. 
However, it will start being used soon, so fix it by moving it from fscrypt.h to fscrypt_supp.h and stubbing it out in fscrypt_notsupp.h. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 8 -------- include/linux/fscrypt_notsupp.h | 5 +++++ include/linux/fscrypt_supp.h | 6 ++++++ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index f627ee378bc3..fc43cc303cf2 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -71,14 +71,6 @@ struct fscrypt_operations { unsigned (*max_namelen)(struct inode *); }; -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) -{ - if (inode->i_sb->s_cop->dummy_context && - inode->i_sb->s_cop->dummy_context(inode)) - return true; - return false; -} - static inline bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 3d394a0737ed..151bbc3c61f1 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -18,6 +18,11 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) return false; } +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + return false; +} + /* crypto.c */ static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 40fee89fac9e..90965fa403b1 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -30,6 +30,12 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) return (inode->i_crypt_info != NULL); } +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + return inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode); +} + /* crypto.c */ extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); extern void 
fscrypt_release_ctx(struct fscrypt_ctx *); From efefa434f47e1d907b3a4c31b9c9f1e561fe57d6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:57 -0800 Subject: [PATCH 601/804] fscrypt: move fscrypt_operations declaration to fscrypt_supp.h Filesystems now only define their fscrypt_operations when they are compiled with encryption support, so move the fscrypt_operations declaration from fscrypt.h to fscrypt_supp.h. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt.h | 18 ------------------ include/linux/fscrypt_supp.h | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index fc43cc303cf2..b29cdfc3486e 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -53,24 +53,6 @@ struct fscrypt_name { #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) -/* - * fscrypt superblock flags - */ -#define FS_CFLG_OWN_PAGES (1U << 1) - -/* - * crypto opertions for filesystems - */ -struct fscrypt_operations { - unsigned int flags; - const char *key_prefix; - int (*get_context)(struct inode *, void *, size_t); - int (*set_context)(struct inode *, const void *, size_t, void *); - bool (*dummy_context)(struct inode *); - bool (*empty_dir)(struct inode *); - unsigned (*max_namelen)(struct inode *); -}; - static inline bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) { diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 90965fa403b1..c785f7297f29 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,24 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +/* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto operations for filesystems + */ +struct fscrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int 
(*set_context)(struct inode *, const void *, size_t, void *); + bool (*dummy_context)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + struct fscrypt_ctx { union { struct { From e6fe930580cb0344a4fbd0b15bb8cda5e3986fee Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:58 -0800 Subject: [PATCH 602/804] fscrypt: move fscrypt_valid_enc_modes() to fscrypt_private.h The encryption modes are validated by fs/crypto/, not by individual filesystems. Therefore, move fscrypt_valid_enc_modes() from fscrypt.h to fscrypt_private.h. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 14 ++++++++++++++ include/linux/fscrypt.h | 14 -------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 09e99f5007de..d5dc791d7228 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -81,6 +81,20 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, bio->bi_rw = op | op_flags; } +static inline bool fscrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; + + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; +} + /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b29cdfc3486e..b03cb23728ea 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -53,20 +53,6 @@ struct fscrypt_name { #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) -static inline bool fscrypt_valid_enc_modes(u32 contents_mode, - u32 filenames_mode) -{ - if (contents_mode == 
FS_ENCRYPTION_MODE_AES_128_CBC && - filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) - return true; - - if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && - filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) - return true; - - return false; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') From d9cadc11bdcf9907041dcaba204384c0ff552b81 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:44:59 -0800 Subject: [PATCH 603/804] fscrypt: move fscrypt_is_dot_dotdot() to fs/crypto/fname.c Only fs/crypto/fname.c cares about treating the "." and ".." filenames specially with regards to encryption, so move fscrypt_is_dot_dotdot() from fscrypt.h to there. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 11 +++++++++++ include/linux/fscrypt.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6eb434363ff2..bce476dc2c65 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,6 +14,17 @@ #include #include "fscrypt_private.h" +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + /** * fname_encrypt() - encrypt a filename * diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index b03cb23728ea..f71d6326936e 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -53,17 +53,6 @@ struct fscrypt_name { #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) -static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) -{ - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' 
&& str->name[1] == '.') - return true; - - return false; -} - #if __FS_HAS_ENCRYPTION #include #else From 7f43602f4d104ad482f54e3a9122f3e5f31d60d9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:45:00 -0800 Subject: [PATCH 604/804] fscrypt: trim down fscrypt.h includes fscrypt.h included way too many other headers, given that it is included by filesystems both with and without encryption support. Trim down the includes list by moving the needed includes into more appropriate places, and removing the unneeded ones. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 1 + fs/crypto/fname.c | 1 + fs/crypto/keyinfo.c | 1 + include/linux/fscrypt.h | 6 ------ include/linux/fscrypt_supp.h | 3 +++ 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 732a786cce9d..ce654526c0fb 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index bce476dc2c65..f5db8bd500b6 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,6 +12,7 @@ #include #include +#include #include "fscrypt_private.h" static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 444c65ed6db8..7c00331da5df 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "fscrypt_private.h" static struct crypto_shash *essiv_hash_tfm; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index f71d6326936e..486886811915 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -13,13 +13,7 @@ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H -#include #include -#include -#include -#include -#include -#include #define FS_CRYPTO_BLOCK_SIZE 16 diff --git 
a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index c785f7297f29..cdfe1600f53e 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -10,6 +10,9 @@ #ifndef _LINUX_FSCRYPT_SUPP_H #define _LINUX_FSCRYPT_SUPP_H +#include +#include + /* * fscrypt superblock flags */ From a1cdacb7ae0db3e376c3c874df8c8793448ad1e9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:45:01 -0800 Subject: [PATCH 605/804] fscrypt: new helper functions for ->symlink() Currently, filesystems supporting fscrypt need to implement some tricky logic when creating encrypted symlinks, including handling a peculiar on-disk format (struct fscrypt_symlink_data) and correctly calculating the size of the encrypted symlink. Introduce helper functions to make things a bit easier: - fscrypt_prepare_symlink() computes and validates the size the symlink target will require on-disk. - fscrypt_encrypt_symlink() creates the encrypted target if needed. The new helpers actually fix some subtle bugs. First, when checking whether the symlink target was too long, filesystems didn't account for the fact that the NUL padding is meant to be truncated if it would cause the maximum length to be exceeded, as is done for filenames in directories. Consequently users would receive ENAMETOOLONG when creating symlinks close to what is supposed to be the maximum length. For example, with EXT4 with a 4K block size, the maximum symlink target length in an encrypted directory is supposed to be 4093 bytes (in comparison to 4095 in an unencrypted directory), but in FS_POLICY_FLAGS_PAD_32-mode only up to 4064 bytes were accepted. Second, symlink targets of "." and ".." were not being encrypted, even though they should be, as these names are special in *directory entries* but not in symlink targets. Fortunately, we can fix this simply by starting to encrypt them, as old kernels already accept them in encrypted form. 
Third, the output string length the filesystems were providing when doing the actual encryption was incorrect, as it was forgotten to exclude 'sizeof(struct fscrypt_symlink_data)'. Fortunately though, this bug didn't make a difference. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 8 +-- fs/crypto/fscrypt_private.h | 4 ++ fs/crypto/hooks.c | 90 +++++++++++++++++++++++++++++++++ include/linux/fscrypt.h | 64 +++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 16 ++++++ include/linux/fscrypt_supp.h | 6 +++ 6 files changed, 185 insertions(+), 3 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f5db8bd500b6..55ca8d913c94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -33,8 +33,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) * * Return: 0 on success, -errno on failure */ -static int fname_encrypt(struct inode *inode, - const struct qstr *iname, struct fscrypt_str *oname) +int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); @@ -55,9 +55,11 @@ static int fname_encrypt(struct inode *inode, * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. 
*/ + if (WARN_ON(oname->len < iname->len)) + return -ENOBUFS; cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); cryptlen = round_up(cryptlen, padding); - cryptlen = min(cryptlen, lim); + cryptlen = min3(cryptlen, lim, oname->len); memcpy(oname->name, iname->name, iname->len); memset(oname->name + iname->len, 0, cryptlen - iname->len); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d5dc791d7228..0712b0ac974b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -108,6 +108,10 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +/* fname.c */ +extern int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname); + /* keyinfo.c */ extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 9f5fb2eb9cf7..4b83e4af2e41 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -110,3 +110,93 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry) return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); + +int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link) +{ + int err; + + /* + * To calculate the size of the encrypted symlink target we need to know + * the amount of NUL padding, which is determined by the flags set in + * the encryption policy which will be inherited from the directory. + * The easiest way to get access to this is to just load the directory's + * fscrypt_info, since we'll need it to create the dir_entry anyway. + * + * Note: in test_dummy_encryption mode, @dir may be unencrypted. + */ + err = fscrypt_get_encryption_info(dir); + if (err) + return err; + if (!fscrypt_has_encryption_key(dir)) + return -ENOKEY; + + /* + * Calculate the size of the encrypted symlink and verify it won't + * exceed max_len. 
Note that for historical reasons, encrypted symlink + * targets are prefixed with the ciphertext length, despite this + * actually being redundant with i_size. This decreases by 2 bytes the + * longest symlink target we can accept. + * + * We could recover 1 byte by not counting a null terminator, but + * counting it (even though it is meaningless for ciphertext) is simpler + * for now since filesystems will assume it is there and subtract it. + */ + if (sizeof(struct fscrypt_symlink_data) + len > max_len) + return -ENAMETOOLONG; + disk_link->len = min_t(unsigned int, + sizeof(struct fscrypt_symlink_data) + + fscrypt_fname_encrypted_size(dir, len), + max_len); + disk_link->name = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink); + +int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, struct fscrypt_str *disk_link) +{ + int err; + struct qstr iname = { .name = target, .len = len }; + struct fscrypt_symlink_data *sd; + unsigned int ciphertext_len; + struct fscrypt_str oname; + + err = fscrypt_require_key(inode); + if (err) + return err; + + if (disk_link->name) { + /* filesystem-provided buffer */ + sd = (struct fscrypt_symlink_data *)disk_link->name; + } else { + sd = kmalloc(disk_link->len, GFP_NOFS); + if (!sd) + return -ENOMEM; + } + ciphertext_len = disk_link->len - sizeof(*sd); + sd->len = cpu_to_le16(ciphertext_len); + + oname.name = sd->encrypted_path; + oname.len = ciphertext_len; + err = fname_encrypt(inode, &iname, &oname); + if (err) { + if (!disk_link->name) + kfree(sd); + return err; + } + BUG_ON(oname.len != ciphertext_len); + + /* + * Null-terminating the ciphertext doesn't make sense, but we still + * count the null terminator in the length, so we might as well + * initialize it just in case the filesystem writes it out. 
+ */ + sd->encrypted_path[ciphertext_len] = '\0'; + + if (!disk_link->name) + disk_link->name = (unsigned char *)sd; + return 0; +} +EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 486886811915..77a171da8254 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,4 +192,68 @@ static inline int fscrypt_prepare_setattr(struct dentry *dentry, return 0; } +/** + * fscrypt_prepare_symlink - prepare to create a possibly-encrypted symlink + * @dir: directory in which the symlink is being created + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @max_len: space the filesystem has available to store the symlink target + * @disk_link: (out) the on-disk symlink target being prepared + * + * This function computes the size the symlink target will require on-disk, + * stores it in @disk_link->len, and validates it against @max_len. An + * encrypted symlink may be longer than the original. + * + * Additionally, @disk_link->name is set to @target if the symlink will be + * unencrypted, but left NULL if the symlink will be encrypted. For encrypted + * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the + * on-disk target later. (The reason for the two-step process is that some + * filesystems need to know the size of the symlink target before creating the + * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) + * + * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, + * -ENOKEY if the encryption key is missing, or another -errno code if a problem + * occurred while setting up the encryption key. 
+ */ +static inline int fscrypt_prepare_symlink(struct inode *dir, + const char *target, + unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link) +{ + if (IS_ENCRYPTED(dir) || fscrypt_dummy_context_enabled(dir)) + return __fscrypt_prepare_symlink(dir, len, max_len, disk_link); + + disk_link->name = (unsigned char *)target; + disk_link->len = len + 1; + if (disk_link->len > max_len) + return -ENAMETOOLONG; + return 0; +} + +/** + * fscrypt_encrypt_symlink - encrypt the symlink target if needed + * @inode: symlink inode + * @target: plaintext symlink target + * @len: length of @target excluding null terminator + * @disk_link: (in/out) the on-disk symlink target being prepared + * + * If the symlink target needs to be encrypted, then this function encrypts it + * into @disk_link->name. fscrypt_prepare_symlink() must have been called + * previously to compute @disk_link->len. If the filesystem did not allocate a + * buffer for @disk_link->name after calling fscrypt_prepare_symlink(), then one + * will be kmalloc()'ed and the filesystem will be responsible for freeing it. 
+ * + * Return: 0 on success, -errno on failure + */ +static inline int fscrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct fscrypt_str *disk_link) +{ + if (IS_ENCRYPTED(inode)) + return __fscrypt_encrypt_symlink(inode, target, len, disk_link); + return 0; +} + #endif /* _LINUX_FSCRYPT_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 151bbc3c61f1..875c83672318 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -222,4 +222,20 @@ static inline int __fscrypt_prepare_lookup(struct inode *dir, return -EOPNOTSUPP; } +static inline int __fscrypt_prepare_symlink(struct inode *dir, + unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + +static inline int __fscrypt_encrypt_symlink(struct inode *inode, + const char *target, + unsigned int len, + struct fscrypt_str *disk_link) +{ + return -EOPNOTSUPP; +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index cdfe1600f53e..6ccaad58d2be 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -204,5 +204,11 @@ extern int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *new_dentry, unsigned int flags); extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry); +extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link); +extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, + struct fscrypt_str *disk_link); #endif /* _LINUX_FSCRYPT_SUPP_H */ From fd457d2c4e0411e56b82f67a3b22c8c589f77038 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 5 Jan 2018 10:45:02 -0800 Subject: [PATCH 606/804] fscrypt: new helper function - fscrypt_get_symlink() Filesystems also have duplicate code to support ->get_link() on encrypted symlinks. 
Factor it out into a new function fscrypt_get_symlink(). It takes in the contents of the encrypted symlink on-disk and provides the target (decrypted or encoded) that should be returned from ->get_link(). Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 71 +++++++++++++++++++++++++++++++++ include/linux/fscrypt_notsupp.h | 7 ++++ include/linux/fscrypt_supp.h | 2 + 3 files changed, 80 insertions(+) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 4b83e4af2e41..534cfb212cdb 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -200,3 +200,74 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, return 0; } EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); + +/** + * fscrypt_get_symlink - get the target of an encrypted symlink + * @inode: the symlink inode + * @caddr: the on-disk contents of the symlink + * @max_size: size of @caddr buffer + * + * If the symlink's encryption key is available, we decrypt its target. + * Otherwise, we encode its target for presentation. + * + * This may sleep, so the filesystem must have dropped out of RCU mode already. + * + * Return: the presentable symlink target or an ERR_PTR(). On success, the + * caller is responsible for kfree()ing the returned target. + */ +void *fscrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size) +{ + const struct fscrypt_symlink_data *sd; + struct fscrypt_str cstr, pstr; + int err; + + /* This is for encrypted symlinks only */ + if (WARN_ON(!IS_ENCRYPTED(inode))) + return ERR_PTR(-EINVAL); + + /* + * Try to set up the symlink's encryption key, but we can continue + * regardless of whether the key is available or not. + */ + err = fscrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + + /* + * For historical reasons, encrypted symlink targets are prefixed with + * the ciphertext length, even though this is redundant with i_size. 
+ */ + + if (max_size < sizeof(*sd)) + return ERR_PTR(-EUCLEAN); + sd = caddr; + cstr.name = (unsigned char *)sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-EUCLEAN); + + if (cstr.len + sizeof(*sd) - 1 > max_size) + return ERR_PTR(-EUCLEAN); + + err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) + goto err_kfree; + + err = -EUCLEAN; + if (pstr.name[0] == '\0') + goto err_kfree; + + pstr.name[pstr.len] = '\0'; + return pstr.name; + +err_kfree: + kfree(pstr.name); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(fscrypt_get_symlink); diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 875c83672318..1886b255adcb 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -238,4 +238,11 @@ static inline int __fscrypt_encrypt_symlink(struct inode *inode, return -EOPNOTSUPP; } +static inline void *fscrypt_get_symlink(struct inode *inode, + const void *caddr, + unsigned int max_size) +{ + return ERR_PTR(-EOPNOTSUPP); +} + #endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 6ccaad58d2be..92e50820fd4f 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -210,5 +210,7 @@ extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); +extern void *fscrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size); #endif /* _LINUX_FSCRYPT_SUPP_H */ From 6b76f58e24bda781a9434989e8d6d4500e91891a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:26:49 -0500 Subject: [PATCH 607/804] f2fs: switch to fscrypt ->symlink() helper functions Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- 
fs/f2fs/namei.c | 66 ++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 50 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6bb1adb84324..7438f0d8c9f0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -478,27 +478,16 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); - struct fscrypt_symlink_data *sd = NULL; + struct fscrypt_str disk_link; int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir)) { - err = fscrypt_get_encryption_info(dir); - if (err) - return err; - - if (!fscrypt_has_encryption_key(dir)) - return -ENOKEY; - - disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + - sizeof(struct fscrypt_symlink_data)); - } - - if (disk_link.len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, + &disk_link); + if (err) + return err; err = dquot_initialize(dir); if (err) @@ -508,7 +497,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return PTR_ERR(inode); - if (f2fs_encrypted_inode(inode)) + if (IS_ENCRYPTED(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; @@ -518,38 +507,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) - goto out; + goto out_handle_failed_inode; f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(inode)) { - struct qstr istr = QSTR_INIT(symname, len); - struct fscrypt_str ostr; - - sd = f2fs_kzalloc(sbi, disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto err_out; - } - - err = fscrypt_get_encryption_info(inode); - if (err) - goto err_out; - - if 
(!fscrypt_has_encryption_key(inode)) { - err = -ENOKEY; - goto err_out; - } - - ostr.name = sd->encrypted_path; - ostr.len = disk_link.len; - err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err) - goto err_out; - - sd->len = cpu_to_le16(ostr.len); - disk_link.name = (char *)sd; - } + err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); + if (err) + goto err_out; err = page_symlink(inode, disk_link.name, disk_link.len); @@ -576,12 +540,14 @@ err_out: f2fs_unlink(dir, dentry); } - kfree(sd); - f2fs_balance_fs(sbi, true); - return err; -out: + goto out_free_encrypted_link; + +out_handle_failed_inode: handle_failed_inode(inode); +out_free_encrypted_link: + if (disk_link.name != (unsigned char *)symname) + kfree(disk_link.name); return err; } From 7ac4756a247488122b526156f119833e6356bd72 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:26:49 -0500 Subject: [PATCH 608/804] f2fs: switch to fscrypt_get_symlink() Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/f2fs/namei.c | 66 +++++++++---------------------------------------- 1 file changed, 11 insertions(+), 55 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7438f0d8c9f0..72328a18c086 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1093,65 +1093,21 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { - struct page *cpage = NULL; - char *caddr, *paddr = NULL; - struct fscrypt_str cstr = FSTR_INIT(NULL, 0); - struct fscrypt_str pstr = FSTR_INIT(NULL, 0); - struct fscrypt_symlink_data *sd; struct inode *inode = d_inode(dentry); - u32 max_size = inode->i_sb->s_blocksize; - int res; + struct page *page; + void *target; - res = fscrypt_get_encryption_info(inode); - if (res) - return ERR_PTR(res); + if (!dentry) + return ERR_PTR(-ECHILD); - cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) - return 
ERR_CAST(cpage); - caddr = page_address(cpage); + page = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(page)) + return ERR_CAST(page); - /* Symlink is encrypted */ - sd = (struct fscrypt_symlink_data *)caddr; - cstr.name = sd->encrypted_path; - cstr.len = le16_to_cpu(sd->len); - - /* this is broken symlink case */ - if (unlikely(cstr.len == 0)) { - res = -ENOENT; - goto errout; - } - - if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { - /* Symlink data on the disk is corrupted */ - res = -EIO; - goto errout; - } - res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); - if (res) - goto errout; - - res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res) - goto errout; - - /* this is broken symlink case */ - if (unlikely(pstr.name[0] == 0)) { - res = -ENOENT; - goto errout; - } - - paddr = pstr.name; - - /* Null-terminate the name */ - paddr[pstr.len] = '\0'; - - put_page(cpage); - return *cookie = paddr; -errout: - fscrypt_fname_free_buffer(&pstr); - put_page(cpage); - return ERR_PTR(res); + target = fscrypt_get_symlink(inode, page_address(page), + inode->i_sb->s_blocksize); + put_page(page); + return *cookie = target; } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { From f9550c24c20e3e4a89c2958c1496588a301d1409 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 609/804] fscrypt: remove fscrypt_fname_usr_to_disk() fscrypt_fname_usr_to_disk() sounded very generic but was actually only used to encrypt symlinks. Remove it now that all filesystems have been switched over to fscrypt_encrypt_symlink(). 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 29 ----------------------------- include/linux/fscrypt_notsupp.h | 7 ------- include/linux/fscrypt_supp.h | 2 -- 3 files changed, 38 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 55ca8d913c94..897041751791 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -310,35 +310,6 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); -/** - * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk - * space - * - * The caller must have allocated sufficient memory for the @oname string. - * - * Return: 0 on success, -errno on failure - */ -int fscrypt_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct fscrypt_str *oname) -{ - if (fscrypt_is_dot_dotdot(iname)) { - oname->name[0] = '.'; - oname->name[iname->len - 1] = '.'; - oname->len = iname->len; - return 0; - } - if (inode->i_crypt_info) - return fname_encrypt(inode, iname, oname); - /* - * Without a proper key, a user is not allowed to modify the filenames - * in a directory. 
Consequently, a user space name cannot be mapped to - * a disk-space name - */ - return -ENOKEY; -} -EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); - /** * fscrypt_setup_filename() - prepare to search a possibly encrypted directory * @dir: the directory that will be searched diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 1886b255adcb..db31cf0c80c5 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -158,13 +158,6 @@ static inline int fscrypt_fname_disk_to_usr(struct inode *inode, return -EOPNOTSUPP; } -static inline int fscrypt_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct fscrypt_str *oname) -{ - return -EOPNOTSUPP; -} - static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 92e50820fd4f..ddd79019e3f9 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -112,8 +112,6 @@ extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, extern void fscrypt_fname_free_buffer(struct fscrypt_str *); extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, const struct fscrypt_str *, struct fscrypt_str *); -extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, - struct fscrypt_str *); #define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 From 042ae9f4cfbfb555e6de68579870ac3c43594215 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 610/804] fscrypt: move fscrypt_symlink_data to fscrypt_private.h Now that all filesystems have been converted to use the symlink helper functions, they no longer need the declaration of 'struct fscrypt_symlink_data'. Move it from fscrypt.h to fscrypt_private.h. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 9 +++++++++ include/linux/fscrypt.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 0712b0ac974b..e44e8e1419d6 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -49,6 +49,15 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 77a171da8254..9e535af579e8 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -20,15 +20,6 @@ struct fscrypt_ctx; struct fscrypt_info; -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct fscrypt_symlink_data { - __le16 len; - char encrypted_path[1]; -} __packed; - struct fscrypt_str { unsigned char *name; u32 len; From 168a90782888affff92b4a4fe950c9e5afca7179 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 611/804] fscrypt: calculate NUL-padding length in one place only Currently, when encrypting a filename (either a real filename or a symlink target) we calculate the amount of NUL-padding twice: once before encryption and once during encryption in fname_encrypt(). It is needed before encryption to allocate the needed buffer size as well as calculate the size the symlink target will take up on-disk before creating the symlink inode. Calculating the size during encryption as well is redundant. 
Remove this redundancy by always calculating the exact size beforehand, and making fname_encrypt() just add as much NUL padding as is needed to fill the output buffer. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 53 ++++++++++++++++++------------------- fs/crypto/fscrypt_private.h | 4 +-- fs/crypto/hooks.c | 7 +---- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 897041751791..3145665c9ca1 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -29,39 +29,29 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fname_encrypt() - encrypt a filename * - * The caller must have allocated sufficient memory for the @oname string. + * The output buffer must be at least as large as the input buffer. + * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ -int fname_encrypt(struct inode *inode, - const struct qstr *iname, struct fscrypt_str *oname) +int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist sg; - int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - unsigned int lim; - unsigned int cryptlen; - - lim = inode->i_sb->s_cop->max_namelen(inode); - if (iname->len <= 0 || iname->len > lim) - return -EIO; /* * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. 
*/ - if (WARN_ON(oname->len < iname->len)) + if (WARN_ON(olen < iname->len)) return -ENOBUFS; - cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); - cryptlen = round_up(cryptlen, padding); - cryptlen = min3(cryptlen, lim, oname->len); - memcpy(oname->name, iname->name, iname->len); - memset(oname->name + iname->len, 0, cryptlen - iname->len); + memcpy(out, iname->name, iname->len); + memset(out + iname->len, 0, olen - iname->len); /* Initialize the IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -76,8 +66,8 @@ int fname_encrypt(struct inode *inode, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - sg_init_one(&sg, oname->name, cryptlen); - skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); + sg_init_one(&sg, out, olen); + skcipher_request_set_crypt(req, &sg, &sg, olen, iv); /* Do the encryption */ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); @@ -88,7 +78,6 @@ int fname_encrypt(struct inode *inode, return res; } - oname->len = cryptlen; return 0; } @@ -353,11 +342,21 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (dir->i_crypt_info) { - ret = fscrypt_fname_alloc_buffer(dir, iname->len, - &fname->crypto_buf); - if (ret) - return ret; - ret = fname_encrypt(dir, iname, &fname->crypto_buf); + unsigned int max_len = dir->i_sb->s_cop->max_namelen(dir); + + if (iname->len > max_len) + return -ENAMETOOLONG; + + fname->crypto_buf.len = + min(fscrypt_fname_encrypted_size(dir, iname->len), + max_len); + fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, + GFP_NOFS); + if (!fname->crypto_buf.name) + return -ENOMEM; + + ret = fname_encrypt(dir, iname, fname->crypto_buf.name, + fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -409,7 +408,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; errout: - fscrypt_fname_free_buffer(&fname->crypto_buf); + 
kfree(fname->crypto_buf.name); return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e44e8e1419d6..eb40d32b8e79 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -118,8 +118,8 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); /* fname.c */ -extern int fname_encrypt(struct inode *inode, - const struct qstr *iname, struct fscrypt_str *oname); +extern int fname_encrypt(struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); /* keyinfo.c */ extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 534cfb212cdb..8b829400f467 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -161,7 +161,6 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, struct qstr iname = { .name = target, .len = len }; struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; - struct fscrypt_str oname; err = fscrypt_require_key(inode); if (err) @@ -178,16 +177,12 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, ciphertext_len = disk_link->len - sizeof(*sd); sd->len = cpu_to_le16(ciphertext_len); - oname.name = sd->encrypted_path; - oname.len = ciphertext_len; - err = fname_encrypt(inode, &iname, &oname); + err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); if (err) { if (!disk_link->name) kfree(sd); return err; } - BUG_ON(oname.len != ciphertext_len); - /* * Null-terminating the ciphertext doesn't make sense, but we still * count the null terminator in the length, so we might as well From 82bec888567bbe1143ae2173b2ef442070ecbe4a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 612/804] fscrypt: define fscrypt_fname_alloc_buffer() to be for presented names Previously fscrypt_fname_alloc_buffer() was used to allocate buffers for both presented (decrypted or 
encoded) and encrypted filenames. That was confusing, because it had to allocate the worst-case size for either, e.g. including NUL-padding even when it was meaningless. But now that fscrypt_setup_filename() no longer calls it, it is only used in the ->get_link() and ->readdir() paths, which specifically want a buffer for presented filenames. Therefore, switch the behavior over to allocating the buffer for presented filenames only. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 29 ++++++++++++++--------------- include/linux/fscrypt_notsupp.h | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 3145665c9ca1..aee2c3c36048 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -203,37 +203,36 @@ u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * fscrypt_fname_crypto_alloc_obuff() - + * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames * - * Allocates an output buffer that is sufficient for the crypto operation - * specified by the context and the direction. + * Allocate a buffer that is large enough to hold any decrypted or encoded + * filename (null-terminated), for the given maximum encrypted filename length. 
+ * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 ilen, struct fscrypt_str *crypto_str) + u32 max_encrypted_len, + struct fscrypt_str *crypto_str) { - u32 olen = fscrypt_fname_encrypted_size(inode, ilen); const u32 max_encoded_len = max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); + u32 max_presented_len; - crypto_str->len = olen; - olen = max(olen, max_encoded_len); + max_presented_len = max(max_encoded_len, max_encrypted_len); - /* - * Allocated buffer can hold one more character to null-terminate the - * string - */ - crypto_str->name = kmalloc(olen + 1, GFP_NOFS); - if (!(crypto_str->name)) + crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); + if (!crypto_str->name) return -ENOMEM; + crypto_str->len = max_presented_len; return 0; } EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * fscrypt_fname_crypto_free_buffer() - + * fscrypt_fname_free_buffer - free the buffer for presented filenames * - * Frees the buffer allocated for crypto operation. + * Free the buffer allocated by fscrypt_fname_alloc_buffer(). 
*/ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index db31cf0c80c5..f5de736cf1c1 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -139,7 +139,7 @@ static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, } static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 ilen, + u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; From 31d3279a4fcaf92099ad5ee613a6cf3db99f7e9b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 11 Jan 2018 23:30:08 -0500 Subject: [PATCH 613/804] fscrypt: fix up fscrypt_fname_encrypted_size() for internal use Filesystems don't need fscrypt_fname_encrypted_size() anymore, so unexport it and move it to fscrypt_private.h. We also never calculate the encrypted size of a filename without having the fscrypt_info present since it is needed to know the amount of NUL-padding which is determined by the encryption policy, and also we will always truncate the NUL-padding to the maximum filename length. Therefore, also make fscrypt_fname_encrypted_size() assume that the fscrypt_info is present, and make it truncate the returned length to the specified max_len. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 29 ++++++++++++++--------------- fs/crypto/fscrypt_private.h | 3 +++ fs/crypto/hooks.c | 10 +++++----- include/linux/fscrypt_notsupp.h | 8 -------- include/linux/fscrypt_supp.h | 1 - 5 files changed, 22 insertions(+), 29 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index aee2c3c36048..b18fa323d1d9 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -190,17 +190,20 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) { - int padding = 32; - struct fscrypt_info *ci = inode->i_crypt_info; + int padding = 4 << (inode->i_crypt_info->ci_flags & + FS_POLICY_FLAGS_PAD_MASK); + u32 encrypted_len; - if (ci) - padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); - return round_up(ilen, padding); + if (orig_len > max_len) + return false; + encrypted_len = max(orig_len, (u32)FS_CRYPTO_BLOCK_SIZE); + encrypted_len = round_up(encrypted_len, padding); + *encrypted_len_ret = min(encrypted_len, max_len); + return true; } -EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames @@ -341,14 +344,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (dir->i_crypt_info) { - unsigned int max_len = dir->i_sb->s_cop->max_namelen(dir); - - if (iname->len > max_len) + if (!fscrypt_fname_encrypted_size(dir, iname->len, + dir->i_sb->s_cop->max_namelen(dir), + &fname->crypto_buf.len)) return -ENAMETOOLONG; - - fname->crypto_buf.len = - min(fscrypt_fname_encrypted_size(dir, iname->len), - max_len); fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, GFP_NOFS); if (!fname->crypto_buf.name) diff --git 
a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index eb40d32b8e79..5c296d4af4a9 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -120,6 +120,9 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); +extern bool fscrypt_fname_encrypted_size(const struct inode *inode, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); /* keyinfo.c */ extern void __exit fscrypt_essiv_cleanup(void); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 8b829400f467..a91f605d81e9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -143,12 +143,12 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ - if (sizeof(struct fscrypt_symlink_data) + len > max_len) + if (!fscrypt_fname_encrypted_size(dir, len, + max_len - sizeof(struct fscrypt_symlink_data), + &disk_link->len)) return -ENAMETOOLONG; - disk_link->len = min_t(unsigned int, - sizeof(struct fscrypt_symlink_data) + - fscrypt_fname_encrypted_size(dir, len), - max_len); + disk_link->len += sizeof(struct fscrypt_symlink_data); + disk_link->name = NULL; return 0; } diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index f5de736cf1c1..5777251400f9 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -130,14 +130,6 @@ static inline void fscrypt_free_filename(struct fscrypt_name *fname) return; } -static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, - u32 ilen) -{ - /* never happens */ - WARN_ON(1); - return 0; -} - static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 max_encrypted_len, struct fscrypt_str *crypto_str) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h 
index ddd79019e3f9..c88d2058902a 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -106,7 +106,6 @@ static inline void fscrypt_free_filename(struct fscrypt_name *fname) kfree(fname->crypto_buf.name); } -extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, struct fscrypt_str *); extern void fscrypt_fname_free_buffer(struct fscrypt_str *); From 7e0e7995ee97a285ca764c854c7e899aecd75949 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 19 Jan 2018 13:45:24 -0800 Subject: [PATCH 614/804] fscrypt: fix build with pre-4.6 gcc versions gcc versions prior to 4.6 require an extra level of braces when using a designated initializer for a member in an anonymous struct or union. This caused a compile error with the 'struct qstr' initialization in __fscrypt_encrypt_symlink(). Fix it by using QSTR_INIT(). Reported-by: Andrew Morton Fixes: 76e81d6d5048 ("fscrypt: new helper functions for ->symlink()") Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index a91f605d81e9..bc010e4609ef 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -158,7 +158,7 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { int err; - struct qstr iname = { .name = target, .len = len }; + struct qstr iname = QSTR_INIT(target, len); struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; From 39575737bb62fc391c8cc8ea5dfea09daed57d5d Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 29 Jan 2018 19:13:15 +0800 Subject: [PATCH 615/804] f2fs: fix potential corruption in area before F2FS_SUPER_OFFSET sb_getblk does not guarantee the buffer head is uptodate. 
If bh is not uptodate, the data (may be used as boot code) in area before F2FS_SUPER_OFFSET may get corrupted when super block is committed. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aaeba346e9d7..63729184bcc4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1898,7 +1898,6 @@ static int __f2fs_commit_super(struct buffer_head *bh, lock_buffer(bh); if (super) memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); - set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); @@ -2338,7 +2337,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) } /* write back-up superblock first */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); + bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); @@ -2349,7 +2348,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; /* write current valid superblock */ - bh = sb_getblk(sbi->sb, sbi->valid_super_block); + bh = sb_bread(sbi->sb, sbi->valid_super_block); if (!bh) return -EIO; err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); From 41dda11641377f1233f14aae8fe8b3d0a2989ff8 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 29 Jan 2018 11:37:45 +0800 Subject: [PATCH 616/804] f2fs: fix heap mode to reset it back Commit 7a20b8a61eff81bdb7097a578752a74860e9d142 ("f2fs: allocate node and hot data in the beginning of partition") introduces another mount option, heap, to reset it back. But it does not do anything for heap mode, so fix it. 
Cc: stable@vger.kernel.org Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- fs/f2fs/segment.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d0de3429c26c..06de4ca9abc9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -191,8 +191,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - /* let's select beginning hot/small space first */ - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + /* let's select beginning hot/small space first in no_heap mode*/ + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bf98f6f34b7e..4e27b6721ba1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2244,7 +2244,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (sbi->segs_per_sec != 1) return CURSEG_I(sbi, type)->segno; - if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + if (test_opt(sbi, NOHEAP) && + (type == CURSEG_HOT_DATA || IS_NODESEG(type))) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) From 2e2a339c9853be971a114d0a572cb85de13d2ad7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Jan 2018 17:29:48 +0800 Subject: [PATCH 617/804] f2fs: restrict inline_xattr_size configuration This patch allows the inline_xattr_size mount option to be enabled only if both the extra_attr and flexible_inline_xattr features are on in the current image.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 63729184bcc4..bb0ab4f5e2d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -699,6 +699,13 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!f2fs_sb_has_extra_attr(sb) || + !f2fs_sb_has_flexible_inline_xattr(sb)) { + f2fs_msg(sb, KERN_ERR, + "extra_attr or flexible_inline_xattr " + "feature is off"); + return -EINVAL; + } if (!test_opt(sbi, INLINE_XATTR)) { f2fs_msg(sb, KERN_ERR, "inline_xattr_size option should be " From eceb943d5d592873f67d25b68d66232f7ef44be7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Jan 2018 17:29:49 +0800 Subject: [PATCH 618/804] f2fs: fix to check extent cache in f2fs_drop_extent_tree If noextent_cache mount option is on, we will never initialize extent tree in inode, but still we're going to access it in f2fs_drop_extent_tree, result in kernel panic as below: BUG: unable to handle kernel NULL pointer dereference at 0000000000000038 IP: _raw_write_lock+0xc/0x30 Call Trace: ? f2fs_drop_extent_tree+0x41/0x70 [f2fs] f2fs_fallocate+0x5a0/0xdd0 [f2fs] ? common_file_perm+0x47/0xc0 ? apparmor_file_permission+0x1a/0x20 vfs_fallocate+0x15b/0x290 SyS_fallocate+0x44/0x70 do_syscall_64+0x6e/0x160 entry_SYSCALL64_slow_path+0x25/0x25 This patch fixes to check extent cache status before using in f2fs_drop_extent_tree. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index ff2352a0ed15..aff6c2ed1c02 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -706,6 +706,9 @@ void f2fs_drop_extent_tree(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; + if (!f2fs_may_extent_tree(inode)) + return; + set_inode_flag(inode, FI_NO_EXTENT); write_lock(&et->lock); From 0671fae134bb95325ddb35405656af3c9236548d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 25 Jan 2018 19:40:08 +0800 Subject: [PATCH 619/804] f2fs: support large nat bitmap Previously, we will store all nat version bitmap in checkpoint pack block, so our total node entry number has a limitation which caused total node number can not exceed (3900 * 8) block * 455 node/block = 14196000. So that once user wants to create more nodes in large size image, it becomes a bottleneck, that's unreasonable. This patch detects the new layout of nat/sit version bitmap in image in order to enable supporting large nat bitmap. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++++ include/linux/f2fs_fs.h | 1 + 2 files changed, 7 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e54ffadb692c..20e940f22c5c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1826,6 +1826,12 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); int offset; + if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { + offset = (flag == SIT_BITMAP) ? 
+ le32_to_cpu(ckpt->nat_ver_bitmap_bytesize) : 0; + return &ckpt->sit_nat_version_bitmap + offset; + } + if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 073365c9808a..23f23b8e6878 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -116,6 +116,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_LARGE_NAT_BITMAP_FLAG 0x00000400 #define CP_NOCRC_RECOVERY_FLAG 0x00000200 #define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 From 180900373ec1684eb94e748915a5c25dde14774e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 31 Jan 2018 09:30:34 +0800 Subject: [PATCH 620/804] f2fs: fix to clear CP_TRIMMED_FLAG Once CP_TRIMMED_FLAG is set, after a reboot, we will never issue discard before LBA becomes invalid again, fix it by clearing the flag in checkpoint without CP_TRIMMED reason. Fixes: 1f43e2ad7bff ("f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 3c343e922f6e..ab1b35856082 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1140,6 +1140,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); From 22fa74c2b0975f8ae05e55860b84ba2557c940ad Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Wed, 31 Jan 2018 11:36:57 +0900 Subject: [PATCH 621/804] f2fs: support passing down write hints given by users to block layer Add the 'whint_mode' mount option that controls which write hints are passed down to block layer. There are "off" and "user-based" mode. The default mode is "off". 1) whint_mode=off. 
F2FS only passes down WRITE_LIFE_NOT_SET. 2) whint_mode=user-based. F2FS tries to pass down hints given by users. User F2FS Block ---- ---- ----- META WRITE_LIFE_NOT_SET HOT_NODE " WARM_NODE " COLD_NODE " ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME extension list " " -- buffered io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET WRITE_LIFE_NONE " " WRITE_LIFE_MEDIUM " " WRITE_LIFE_LONG " " -- direct io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET WRITE_LIFE_NONE " WRITE_LIFE_NONE WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM WRITE_LIFE_LONG " WRITE_LIFE_LONG Many thanks to Chao Yu and Jaegeuk Kim for comments to implement this patch. Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu [Jaegeuk Kim: avoid build warning] [Chao Yu: fix to restore whint_mode in ->remount_fs] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 32 +++++++++++++++------ fs/f2fs/f2fs.h | 18 ++++++------ fs/f2fs/segment.c | 59 ++++++++++++++++++++++++++++++++++++++ fs/f2fs/super.c | 31 +++++++++++++++++++- include/linux/blk_types.h | 1 + include/linux/fs.h | 17 +++++++++-- include/uapi/linux/fcntl.h | 21 ++++++++++++++ 7 files changed, 158 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c80f138b0f33..680241a10505 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -173,15 +173,22 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, struct writeback_control *wbc, - int npages, bool is_read) + int npages, bool is_read, + enum page_type type, enum temp_type temp) { struct bio *bio; bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); - bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; - bio->bi_private = is_read ? 
NULL : sbi; + if (is_read) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + } if (wbc) wbc_init_bio(wbc, bio); @@ -380,7 +387,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) /* Allocate a new bio */ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, - 1, is_read_io(fio->op)); + 1, is_read_io(fio->op), fio->type, fio->temp); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -443,7 +450,8 @@ alloc_new: goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, - BIO_MAX_PAGES, false); + BIO_MAX_PAGES, false, + fio->type, fio->temp); io->fio = *fio; } @@ -867,8 +875,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { - /* map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); */ - map.m_seg_type = rw_hint_to_seg_type(WRITE_LIFE_NOT_SET); + map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); flag = __force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; @@ -1150,8 +1157,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DEFAULT, NULL, rw_hint_to_seg_type( - WRITE_LIFE_NOT_SET)); - /* inode->i_write_hint)); */ + inode->i_write_hint)); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -2292,9 +2298,12 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t count = iov_iter_count(iter); int rw = iov_iter_rw(iter); int err; + enum rw_hint hint = iocb->ki_hint; + int whint_mode = sbi->whint_mode; err = check_direct_IO(inode, iter, offset); if (err) @@ -2305,11 +2314,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, rw); + if (rw == WRITE && whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + down_read(&F2FS_I(inode)->dio_rwsem[rw]); err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 20e940f22c5c..2aa47fac39a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1101,6 +1101,11 @@ enum { MAX_TIME, }; +enum { + WHINT_MODE_OFF, /* not pass down write hints */ + WHINT_MODE_USER, /* try to pass down hints given by users */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1284,6 +1289,8 @@ struct f2fs_sb_info { char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif + /* For which write hints are passed down to block layer */ 
+ int whint_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -2573,15 +2580,6 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } -enum rw_hint { - WRITE_LIFE_NOT_SET = 0, - WRITE_LIFE_NONE = 1, /* RWH_WRITE_LIFE_NONE */ - WRITE_LIFE_SHORT = 2, /* RWH_WRITE_LIFE_SHORT */ - WRITE_LIFE_MEDIUM = 3, /* RWH_WRITE_LIFE_MEDIUM */ - WRITE_LIFE_LONG = 4, /* RWH_WRITE_LIFE_LONG */ - WRITE_LIFE_EXTREME = 5, /* RWH_WRITE_LIFE_EXTREME */ -}; - static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { @@ -2862,6 +2860,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); int rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, + enum temp_type temp); /* * checkpoint.c diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4e27b6721ba1..5dc604058205 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2536,6 +2536,62 @@ int rw_hint_to_seg_type(enum rw_hint hint) } } +/* This returns write hints for each segment type. This hints will be + * passed down to block layer. There are mapping tables which depend on + * the mount option 'whint_mode'. + * + * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + * + * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. 
+ * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NOT_SET + * HOT_NODE " + * WARM_NODE " + * COLD_NODE " + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + * + */ + +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + if (sbi->whint_mode == WHINT_MODE_USER) { + if (type == DATA) { + switch (temp) { + case COLD: + return WRITE_LIFE_EXTREME; + case HOT: + return WRITE_LIFE_SHORT; + default: + return WRITE_LIFE_NOT_SET; + } + } else { + return WRITE_LIFE_NOT_SET; + } + } else { + return WRITE_LIFE_NOT_SET; + } +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2724,6 +2780,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = META, + .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_META | REQ_PRIO, .old_blkaddr = page->index, @@ -2772,6 +2829,8 @@ int rewrite_data_page(struct f2fs_io_info *fio) int err; fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bb0ab4f5e2d4..96a720680eec 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -129,6 +129,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, + Opt_whint, Opt_err, }; 
@@ -182,6 +183,7 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_whint, "whint_mode=%s"}, {Opt_err, NULL}, }; @@ -679,6 +681,22 @@ static int parse_options(struct super_block *sb, char *options) "quota operations not supported"); break; #endif + case Opt_whint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 10 && + !strncmp(name, "user-based", 10)) { + sbi->whint_mode = WHINT_MODE_USER; + } else if (strlen(name) == 3 && + !strncmp(name, "off", 3)) { + sbi->whint_mode = WHINT_MODE_OFF; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -722,6 +740,12 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + /* Not pass down write hints if the number of active logs is lesser + * than NR_CURSEG_TYPE. + */ + if (sbi->active_logs != NR_CURSEG_TYPE) + sbi->whint_mode = WHINT_MODE_OFF; return 0; } @@ -1233,6 +1257,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); + if (sbi->whint_mode == WHINT_MODE_USER) + seq_printf(seq, ",whint_mode=%s", "user-based"); return 0; } @@ -1242,6 +1268,7 @@ static void default_options(struct f2fs_sb_info *sbi) /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + sbi->whint_mode = WHINT_MODE_OFF; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1282,6 +1309,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + int old_whint_mode = sbi->whint_mode; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif @@ -1381,7 +1409,7 @@ static 
int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY) { + if (*flags & MS_RDONLY || sbi->whint_mode != old_whint_mode) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1431,6 +1459,7 @@ restore_opts: sbi->s_qf_names[i] = s_qf_names[i]; } #endif + sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0fb65843ec1e..f0942a82bb20 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -47,6 +47,7 @@ struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; unsigned int bi_flags; /* status, command, etc */ + unsigned short bi_write_hint; int bi_error; unsigned long bi_rw; /* bottom bits READ/WRITE, * top bits priority diff --git a/include/linux/fs.h b/include/linux/fs.h index 933978eb92fb..8231cdc25901 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -318,6 +319,18 @@ struct page; struct address_space; struct writeback_control; +/* + * Write life time hint values. 
+ */ +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +}; + #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) @@ -328,6 +341,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; + enum rw_hint ki_hint; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -624,6 +638,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED @@ -1053,8 +1068,6 @@ struct file_lock_context { #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif -#include - extern void send_sigio(struct fown_struct *fown, int fd, int band); #ifdef CONFIG_FILE_LOCKING diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index beed138bd359..f85ed3a5ef4d 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -42,6 +42,27 @@ #define F_SEAL_WRITE 0x0008 /* prevent writes */ /* (1U << 31) is reserved for signed error codes */ +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. 
+ */ +#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + /* * Types of directory notifications that may be requested. */ From 92b12bb1a23e6e808e40d2c01f231b881e44abb2 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Wed, 31 Jan 2018 11:36:58 +0900 Subject: [PATCH 622/804] f2fs: support passing down write hints to block layer with F2FS policy Add 'whint_mode=fs-based' mount option. In this mode, F2FS passes down write hints with its policy. * whint_mode=fs-based. F2FS passes down hints with its policy. User F2FS Block ---- ---- ----- META WRITE_LIFE_MEDIUM; HOT_NODE WRITE_LIFE_NOT_SET WARM_NODE " COLD_NODE WRITE_LIFE_NONE ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME extension list " " -- buffered io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG WRITE_LIFE_NONE " " WRITE_LIFE_MEDIUM " " WRITE_LIFE_LONG " " -- direct io WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET WRITE_LIFE_NONE " WRITE_LIFE_NONE WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM WRITE_LIFE_LONG " WRITE_LIFE_LONG Many thanks to Chao Yu and Jaegeuk Kim for comments to implement this patch. 
Signed-off-by: Hyunchul Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 57 +++++++++++++++++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++++ 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2aa47fac39a8..385ad384775a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1104,6 +1104,7 @@ enum { enum { WHINT_MODE_OFF, /* not pass down write hints */ WHINT_MODE_USER, /* try to pass down hints given by users */ + WHINT_MODE_FS, /* pass down hints with F2FS policy */ }; struct f2fs_sb_info { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5dc604058205..3a150018fd2c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2569,6 +2569,32 @@ int rw_hint_to_seg_type(enum rw_hint hint) * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM * WRITE_LIFE_LONG " WRITE_LIFE_LONG * + * 3) whint_mode=fs-based. F2FS passes down hints with its policy. + * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_MEDIUM; + * HOT_NODE WRITE_LIFE_NOT_SET + * WARM_NODE " + * COLD_NODE WRITE_LIFE_NONE + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, @@ -2576,20 +2602,33 @@ enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, { if (sbi->whint_mode == WHINT_MODE_USER) { if (type == DATA) { - switch (temp) { - case COLD: - return WRITE_LIFE_EXTREME; - case HOT: - return 
WRITE_LIFE_SHORT; - default: + if (temp == WARM) return WRITE_LIFE_NOT_SET; - } + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; } else { return WRITE_LIFE_NOT_SET; } - } else { - return WRITE_LIFE_NOT_SET; + } else if (sbi->whint_mode == WHINT_MODE_FS) { + if (type == DATA) { + if (temp == WARM) + return WRITE_LIFE_LONG; + else if (temp == HOT) + return WRITE_LIFE_SHORT; + else if (temp == COLD) + return WRITE_LIFE_EXTREME; + } else if (type == NODE) { + if (temp == WARM || temp == HOT) + return WRITE_LIFE_NOT_SET; + else if (temp == COLD) + return WRITE_LIFE_NONE; + } else if (type == META) { + return WRITE_LIFE_MEDIUM; + } } + return WRITE_LIFE_NOT_SET; } static int __get_segment_type_2(struct f2fs_io_info *fio) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96a720680eec..8b6edc4e5cab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -691,6 +691,9 @@ static int parse_options(struct super_block *sb, char *options) } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { sbi->whint_mode = WHINT_MODE_OFF; + } else if (strlen(name) == 8 && + !strncmp(name, "fs-based", 8)) { + sbi->whint_mode = WHINT_MODE_FS; } else { kfree(name); return -EINVAL; @@ -1259,6 +1262,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) f2fs_show_quota_options(seq, sbi->sb); if (sbi->whint_mode == WHINT_MODE_USER) seq_printf(seq, ",whint_mode=%s", "user-based"); + else if (sbi->whint_mode == WHINT_MODE_FS) + seq_printf(seq, ",whint_mode=%s", "fs-based"); return 0; } From 889d98087652d168cccc3ebb84d62efa6e825644 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Feb 2018 17:01:48 -0800 Subject: [PATCH 623/804] f2fs: handle quota for orphan inodes This is to detect dquot_initialize errors early from evict_inode for orphan inodes. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ab1b35856082..29bb6209dee2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -573,13 +573,8 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct node_info ni; int err = acquire_orphan_inode(sbi); - if (err) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", - __func__, ino); - return err; - } + if (err) + goto err_out; __add_ino_entry(sbi, ino, 0, ORPHAN_INO); @@ -593,6 +588,11 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } + err = dquot_initialize(inode); + if (err) + goto err_out; + + dquot_initialize(inode); clear_nlink(inode); /* truncate all the data during iput */ @@ -602,14 +602,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x) by kernel, retry mount.", - __func__, ino); - return -EIO; + err = -EIO; + goto err_out; } __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; + +err_out: + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; } int recover_orphan_inodes(struct f2fs_sb_info *sbi) From 190e64a819df14ed6406f6cb075a5177155b4101 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 3 Feb 2018 17:44:39 +0800 Subject: [PATCH 624/804] f2fs: fix to handle looped node chain during recovery There is no checksum in the node block now, so a bit-transition from hardware can corrupt node_footer.next_blkaddr without any detection, resulting in the node chain becoming a looped one.
For this condition, during recovery, in order to avoid running into dead loop, let's detect it and just skip out. Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 210de28c9cd2..4ddc2262baf1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -242,6 +242,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; + unsigned int loop_cnt = 0; + unsigned int free_blocks = sbi->user_block_count - + valid_user_blocks(sbi); int err = 0; /* get node pages in the current segment */ @@ -294,6 +297,17 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (IS_INODE(page) && is_dent_dnode(page)) entry->last_dentry = blkaddr; next: + /* sanity check in order to detect looped node chain */ + if (++loop_cnt >= free_blocks || + blkaddr == next_blkaddr_of_node(page)) { + f2fs_msg(sbi->sb, KERN_NOTICE, + "%s: detect looped node chain, " + "blkaddr:%u, next:%u", + __func__, blkaddr, next_blkaddr_of_node(page)); + err = -EINVAL; + break; + } + /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); From a292477154b522b4dfa38d62f5249e6999a93a82 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 6 Feb 2018 08:21:45 +0800 Subject: [PATCH 625/804] f2fs: remove redundant check of page type when submit bio This patch removes the redundant check of page type when submitting a bio, to make the logic clearer.
Signed-off-by: Tiezhu Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 680241a10505..a6ebf4549529 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -201,13 +201,12 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (!is_read_io(bio_op(bio))) { unsigned int start; - if (f2fs_sb_mounted_blkzoned(sbi->sb) && - current->plug && (type == DATA || type == NODE)) - blk_finish_plug(current->plug); - if (type != DATA && type != NODE) goto submit_io; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug) + blk_finish_plug(current->plug); + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; start %= F2FS_IO_SIZE(sbi); From e5081a52ac0965739126d52db39b32a12e7a06b7 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 6 Feb 2018 12:31:17 +0800 Subject: [PATCH 626/804] f2fs: clean up f2fs_sb_has_xxx functions This patch introduces F2FS_FEATURE_FUNCS to clean up the definitions of different f2fs_sb_has_xxx functions. 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 51 ++++++++++++----------------------------------- fs/f2fs/file.c | 6 +++--- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 14 ++++++------- fs/f2fs/sysfs.c | 4 ++-- 6 files changed, 28 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a6ebf4549529..9ca848dc9dc0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -204,7 +204,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug) + if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug) blk_finish_plug(current->plug); start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 385ad384775a..1653f6010495 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3267,45 +3267,20 @@ static inline bool f2fs_bio_encrypted(struct bio *bio) return bio->bi_private != NULL; } -static inline int f2fs_sb_has_crypto(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); +#define F2FS_FEATURE_FUNCS(name, flagname) \ +static inline int f2fs_sb_has_##name(struct super_block *sb) \ +{ \ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \ } -static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); -} - -static inline int f2fs_sb_has_extra_attr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); -} - -static inline int f2fs_sb_has_project_quota(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); -} - -static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); -} - -static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); -} - -static 
inline int f2fs_sb_has_quota_ino(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); -} - -static inline int f2fs_sb_has_inode_crtime(struct super_block *sb) -{ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CRTIME); -} +F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); +F2FS_FEATURE_FUNCS(blkzoned, BLKZONED); +F2FS_FEATURE_FUNCS(extra_attr, EXTRA_ATTR); +F2FS_FEATURE_FUNCS(project_quota, PRJQUOTA); +F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); +F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); +F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, @@ -3325,7 +3300,7 @@ static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); + return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 65cda5bc61b7..7c7d0477c057 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1936,7 +1936,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -1946,7 +1946,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -1957,7 +1957,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); int err; - if (!f2fs_sb_has_crypto(inode->i_sb)) + if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3a150018fd2c..d4e09133c013 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1565,7 +1565,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb) && + if (f2fs_sb_has_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif @@ -1763,7 +1763,7 @@ find_next: sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (f2fs_sb_mounted_blkzoned(sbi->sb) || + if (f2fs_sb_has_blkzoned(sbi->sb) || (force && len < cpc->trim_minlen)) goto skip; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8b6edc4e5cab..b6d70d6d8a27 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -405,14 +405,14 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else if (!f2fs_sb_mounted_blkzoned(sb)) { + } else if (!f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "discard is required for zoned block devices"); return -EINVAL; @@ -561,7 +561,7 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "adaptive mode is not allowed with " "zoned block device feature"); @@ -1283,7 +1283,7 @@ static void default_options(struct f2fs_sb_info *sbi) 
set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_blkzoned(sbi->sb)) { + if (f2fs_sb_has_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -2250,7 +2250,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) unsigned int n = 0; int err = -EIO; - if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + if (!f2fs_sb_has_blkzoned(sbi->sb)) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != @@ -2461,7 +2461,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sbi->sb)) { + !f2fs_sb_has_blkzoned(sbi->sb)) { f2fs_msg(sbi->sb, KERN_ERR, "Zoned block device feature not enabled\n"); return -EINVAL; @@ -2556,7 +2556,7 @@ try_onemore: * devices, but mandatory for host-managed zoned block devices. */ #ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); err = -EOPNOTSUPP; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d978c7b6ea04..374ee5c82f94 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -92,10 +92,10 @@ static ssize_t features_show(struct f2fs_attr *a, if (!sb->s_bdev->bd_part) return snprintf(buf, PAGE_SIZE, "0\n"); - if (f2fs_sb_has_crypto(sb)) + if (f2fs_sb_has_encrypt(sb)) len += snprintf(buf, PAGE_SIZE - len, "%s", "encryption"); - if (f2fs_sb_mounted_blkzoned(sb)) + if (f2fs_sb_has_blkzoned(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sb)) From 946aefc7545d4eacf8f18ffac7db09a7d59e9b8f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 10 Feb 2018 12:12:51 +0800 Subject: [PATCH 627/804] f2fs: flush cp pack except cp pack 2 page at first Previously, we attempt to flush the whole cp pack in a single bio, however, when suddenly powering off at this time, we could get into an extreme scenario that cp pack 1 page and cp pack 2 page are updated and latest, but payload or current summaries are still partially outdated. (see reliable write in the UFS specification) This patch submits the whole cp pack except cp pack 2 page at first, and then writes the cp pack 2 page with an extra independent bio with pre-io barrier. Signed-off-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 69 +++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 29bb6209dee2..9db919c423b6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1172,6 +1172,39 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_unlock_irqrestore(&sbi->cp_lock, flags); } +static void commit_checkpoint(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) +{ + struct writeback_control wbc = { + .for_reclaim = 0, + }; + + /* + * pagevec_lookup_tag and lock_page again will take + * some extra time. Therefore, update_meta_pages and + * sync_meta_pages are combined in this function. 
+ */ + struct page *page = grab_meta_page(sbi, blk_addr); + int err; + + memcpy(page_address(page), src, PAGE_SIZE); + set_page_dirty(page); + + f2fs_wait_on_page_writeback(page, META, true); + f2fs_bug_on(sbi, PageWriteback(page)); + if (unlikely(!clear_page_dirty_for_io(page))) + f2fs_bug_on(sbi, 1); + + /* writeout cp pack 2 page */ + err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); + f2fs_bug_on(sbi, err); + + f2fs_put_page(page, 0); + + /* submit checkpoint (with barrier if NOBARRIER is not set) */ + f2fs_submit_merged_write(sbi, META_FLUSH); +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1274,16 +1307,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } } - /* need to wait for end_io results */ - wait_on_all_pages_writeback(sbi); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; - - /* flush all device cache */ - err = f2fs_flush_device_cache(sbi); - if (err) - return err; - /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1311,26 +1334,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += NR_CURSEG_NODE_TYPE; } - /* writeout checkpoint block */ - update_meta_page(sbi, ckpt, start_blk); + /* update user_block_counts */ + sbi->last_valid_block_count = sbi->total_valid_block_count; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); - /* wait for previous submitted node/meta pages writeback */ + /* Here, we have one bio having CP pack except cp pack 2 page */ + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + + /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) return -EIO; - filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return 
err; - /* update user_block_counts */ - sbi->last_valid_block_count = sbi->total_valid_block_count; - percpu_counter_set(&sbi->alloc_valid_block_count, 0); - - /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); - - /* wait for previous submitted meta pages writeback */ + /* barrier and flush checkpoint cp pack 2 page if it can */ + commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); release_ino_entry(sbi, false); From 1f6bac14c10061c2556deb4bf50600971d911b50 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Feb 2018 18:13:40 +0000 Subject: [PATCH 628/804] f2fs: remove redundant initialization of pointer 'p' Pointer p is initialized with a value that is never read and is later re-assigned a new value, hence the initialization is redundant and can be removed. Cleans up clang warning: fs/f2fs/extent_cache.c:463:19: warning: Value stored to 'p' during its initialization is never read Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index aff6c2ed1c02..d5a861bf2b42 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -460,7 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, struct rb_node *insert_parent) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct rb_node **p = &et->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; From 4d409fa3346bf97cc68435cf49a6ab7c5733b27f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 11 Feb 2018 22:53:20 +0800 Subject: [PATCH 629/804] f2fs: introduce sb_lock to make encrypt pwsalt update exclusive f2fs_super_block.encrypt_pw_salt can be updated and persisted concurrently, resulting in getting different pwsalt in separate threads, so let's introduce sb_lock to exclude concurrent accessors.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 19 +++++++++++-------- fs/f2fs/super.c | 2 ++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1653f6010495..a8ea66cb45ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1111,6 +1111,7 @@ struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + struct mutex sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7c7d0477c057..9152fb41764a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1960,13 +1960,15 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (!f2fs_sb_has_encrypt(inode->i_sb)) return -EOPNOTSUPP; - if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) - goto got_it; - err = mnt_want_write_file(filp); if (err) return err; + mutex_lock(&sbi->sb_lock); + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + /* update superblock with uuid */ generate_random_uuid(sbi->raw_super->encrypt_pw_salt); @@ -1974,15 +1976,16 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); - mnt_drop_write_file(filp); - return err; + goto out_err; } - mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) - return -EFAULT; - return 0; + err = -EFAULT; +out_err: + mutex_unlock(&sbi->sb_lock); + mnt_drop_write_file(filp); + return err; } static int f2fs_ioc_gc(struct file *filp, unsigned long arg) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b6d70d6d8a27..f86374cc4470 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2225,6 +2225,8 @@ static 
void init_sb_info(struct f2fs_sb_info *sbi) sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); + + mutex_init(&sbi->sb_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) From 98b329de5026821e871b933aeb8815d3ceb3b03b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 25 Feb 2018 23:38:21 +0800 Subject: [PATCH 630/804] f2fs: fix to set KEEP_SIZE bit in f2fs_zero_range As Jayashree Mohan reported: A simple workload to reproduce this would be : 1. create foo 2. Write (8K - 16K) // foo size = 16K now 3. fsync() 4. falloc zero_range , keep_size (4202496 - 4210688) // foo size must be 16K 5. fdatasync() Crash now On recovery, we see that the file size is 4210688 and not 16K, which violates the semantics of keep_size flag. We have a test case to reproduce this using CrashMonkey on 4.15 kernel. Try this out by simply running : ./c_harness -f /dev/sda -d /dev/cow_ram0 -t f2fs -e 102400 -P -v tests/generic_468_zero.so The root cause is that we miss to set KEEP_SIZE bit correctly in zero_range when zeroing block cross EOF with FALLOC_FL_KEEP_SIZE, let's fix this missing case. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9152fb41764a..84614f5d1689 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1354,8 +1354,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } out_sem: up_write(&F2FS_I(inode)->i_mmap_sem); From 766d2321697fe98dd0db6b06aa4b41ef9559f506 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 26 Feb 2018 22:04:13 +0800 Subject: [PATCH 631/804] f2fs: expose extension_list sysfs entry This patch adds a sysfs entry 'extension_list' to support query/add/del item in extension list. Query: cat /sys/fs/f2fs//extension_list Add: echo 'extension' > /sys/fs/f2fs//extension_list Del: echo '!extension' > /sys/fs/f2fs//extension_list Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 ++++++ fs/f2fs/f2fs.h | 4 ++- fs/f2fs/file.c | 4 +-- fs/f2fs/namei.c | 42 +++++++++++++++++++++++-- fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 40 +++++++++++++++++++++++ include/linux/f2fs_fs.h | 3 +- 7 files changed, 96 insertions(+), 8 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index db7aab1516de..be3f74ec05b5 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -192,3 +192,12 @@ Date: November 2017 Contact: "Sheng Yong" Description: Controls readahead inode block in readdir. 
+ +What: /sys/fs/f2fs//extension_list +Date: Feburary 2018 +Contact: "Chao Yu" +Description: + Used to control configure extension list: + - Query: cat /sys/fs/f2fs//extension_list + - Add: echo 'extension' > /sys/fs/f2fs//extension_list + - Del: echo '!extension' > /sys/fs/f2fs//extension_list diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8ea66cb45ed..621603b1835c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1111,7 +1111,7 @@ struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - struct mutex sb_lock; /* lock for raw super block */ + struct rw_semaphore sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ @@ -2690,6 +2690,8 @@ void handle_failed_inode(struct inode *inode); /* * namei.c */ +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84614f5d1689..e418fc5b3fed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1968,7 +1968,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) if (err) return err; - mutex_lock(&sbi->sb_lock); + down_write(&sbi->sb_lock); if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) goto got_it; @@ -1987,7 +1987,7 @@ got_it: 16)) err = -EFAULT; out_err: - mutex_unlock(&sbi->sb_lock); + up_write(&sbi->sb_lock); mnt_drop_write_file(filp); return err; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 72328a18c086..685f94ba760b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -171,16 +171,52 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { - int i; - __u8 (*extlist)[8] = 
sbi->raw_super->extension_list; + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, count; + + down_read(&sbi->sb_lock); + + count = le32_to_cpu(sbi->raw_super->extension_count); - int count = le32_to_cpu(sbi->raw_super->extension_count); for (i = 0; i < count; i++) { if (is_multimedia_file(name, extlist[i])) { file_set_cold(inode); break; } } + + up_read(&sbi->sb_lock); +} + +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int count = le32_to_cpu(sbi->raw_super->extension_count); + int i; + + for (i = 0; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (count - i - 1)); + memset(extlist[count - 1], 0, F2FS_EXTENSION_LEN); + sbi->raw_super->extension_count = cpu_to_le32(count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (count == F2FS_MAX_EXTENSION) + return -EINVAL; + + strncpy(extlist[count], name, strlen(name)); + sbi->raw_super->extension_count = cpu_to_le32(count + 1); + return 0; } static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f86374cc4470..ec68aa982649 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2226,7 +2226,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dirty_device = 0; spin_lock_init(&sbi->dev_lock); - mutex_init(&sbi->sb_lock); + init_rwsem(&sbi->sb_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 374ee5c82f94..d27b28e602a6 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -136,6 +136,18 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + __u8 (*extlist)[F2FS_EXTENSION_LEN] = + sbi->raw_super->extension_list; + int count = 
le32_to_cpu(sbi->raw_super->extension_count); + int len = 0, i; + + for (i = 0; i < count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return snprintf(buf, PAGE_SIZE, "%u\n", *ui); @@ -154,6 +166,32 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!ptr) return -EINVAL; + if (!strcmp(a->attr.name, "extension_list")) { + const char *name = strim((char *)buf); + bool set = true; + + if (name[0] == '!') { + name++; + set = false; + } + + if (strlen(name) >= F2FS_EXTENSION_LEN) + return -EINVAL; + + down_write(&sbi->sb_lock); + + ret = update_extension_list(sbi, name, set); + if (ret) + goto out; + + ret = f2fs_commit_super(sbi, false); + if (ret) + update_extension_list(sbi, name, !set); +out: + up_write(&sbi->sb_lock); + return ret ? ret : count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -307,6 +345,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); #ifdef CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); @@ -357,6 +396,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), + ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 23f23b8e6878..bee1211bc2b9 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -21,6 +21,7 @@ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ 
#define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ +#define F2FS_EXTENSION_LEN 8 /* max size of extension */ #define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ @@ -101,7 +102,7 @@ struct f2fs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */ __le32 extension_count; /* # of extensions below */ - __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ + __u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];/* extension array */ __le32 cp_payload; __u8 version[VERSION_LEN]; /* the kernel version */ __u8 init_version[VERSION_LEN]; /* the initial kernel version */ From b7982989124958d1ad880bae0b5169e6eaa00421 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 26 Feb 2018 09:19:47 -0800 Subject: [PATCH 632/804] f2fs: don't stop GC if GC is contended Let's do GC as much as possible, while gc_urgent is set. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 06de4ca9abc9..7725999394b0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,14 +76,15 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (!mutex_trylock(&sbi->gc_mutex)) - goto next; - if (gc_th->gc_urgent) { wait_ms = gc_th->urgent_sleep_time; + mutex_lock(&sbi->gc_mutex); goto do_gc; } + if (!mutex_trylock(&sbi->gc_mutex)) + goto next; + if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); From 0ffdffc8f106628a4c6bc3eed2eb3cf88393d2f3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 18 Feb 2018 08:50:49 -0800 Subject: [PATCH 633/804] f2fs: add mount option for segment allocation policy This patch adds a mount option, "alloc_mode=%s" having two options, "default" and "reuse". In "alloc_mode=reuse" case, f2fs starts to allocate segments from 0'th segment all the time to reassign segments. It'd be useful for small-sized eMMC parts. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 8 ++++++++ fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/segment.c | 5 +++++ fs/f2fs/super.c | 26 ++++++++++++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 6cf9ad12c57f..579c1119131d 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -172,6 +172,14 @@ offgrpjquota Turn off group journelled quota. offprjjquota Turn off project journelled quota. quota Enable plain user disk quota accounting. noquota Disable all plain disk quota option. +whint_mode=%s Control which write hints are passed down to block + layer. This supports "off", "user-based", and + "fs-based". In "off" mode (default), f2fs does not pass + down hints. In "user-based" mode, f2fs tries to pass + down hints given by users. And in "fs-based" mode, f2fs + passes down hints with its policy. +alloc_mode=%s Adjust block allocation policy, which supports "reuse" + and "default".
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 621603b1835c..1f88986207ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1107,6 +1107,11 @@ enum { WHINT_MODE_FS, /* pass down hints with F2FS policy */ }; +enum { + ALLOC_MODE_DEFAULT, /* stay default */ + ALLOC_MODE_REUSE, /* reuse segments as much as possible */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1293,6 +1298,9 @@ struct f2fs_sb_info { #endif /* For which write hints are passed down to block layer */ int whint_mode; + + /* segment allocation policy */ + int alloc_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d4e09133c013..da498a1de469 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2250,6 +2250,11 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + + /* find segments from 0 to reuse freed segments */ + if (sbi->alloc_mode == ALLOC_MODE_REUSE) + return 0; + return CURSEG_I(sbi, type)->segno; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ec68aa982649..ff59af55ccd4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -130,6 +130,7 @@ enum { Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_whint, + Opt_alloc, Opt_err, }; @@ -184,6 +185,7 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_whint, "whint_mode=%s"}, + {Opt_alloc, "alloc_mode=%s"}, {Opt_err, NULL}, }; @@ -700,6 +702,23 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_alloc: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (strlen(name) == 7 && + !strncmp(name, "default", 7)) { + sbi->alloc_mode = ALLOC_MODE_DEFAULT; + } else if (strlen(name) == 5 
&& + !strncmp(name, "reuse", 5)) { + sbi->alloc_mode = ALLOC_MODE_REUSE; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -1265,6 +1284,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (sbi->whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); + if (sbi->alloc_mode == ALLOC_MODE_DEFAULT) + seq_printf(seq, ",alloc_mode=%s", "default"); + else if (sbi->alloc_mode == ALLOC_MODE_REUSE) + seq_printf(seq, ",alloc_mode=%s", "reuse"); return 0; } @@ -1274,6 +1297,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->active_logs = NR_CURSEG_TYPE; sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; sbi->whint_mode = WHINT_MODE_OFF; + sbi->alloc_mode = ALLOC_MODE_DEFAULT; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1315,6 +1339,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); int old_whint_mode = sbi->whint_mode; + int old_alloc_mode = sbi->alloc_mode; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif @@ -1464,6 +1489,7 @@ restore_opts: sbi->s_qf_names[i] = s_qf_names[i]; } #endif + sbi->alloc_mode = old_alloc_mode; sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; From 1aa536a624cc246bcafd5ace82abe3b50e47c802 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 22 Feb 2018 14:09:30 -0800 Subject: [PATCH 634/804] f2fs: add auto tuning for small devices If f2fs is running on top of very small devices, it's worth to avoid abusing free LBAs. In order to achieve that, this patch introduces some parameter tuning. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5d6d3e72be31..d1524d16b2a0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -596,6 +596,8 @@ static inline int utilization(struct f2fs_sb_info *sbi) #define DEF_MIN_FSYNC_BLOCKS 8 #define DEF_MIN_HOT_BLOCKS 16 +#define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ + enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ff59af55ccd4..28c49fc34e86 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2523,6 +2523,18 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) return 0; } +static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_i = SM_I(sbi); + + /* adjust parameters according to the volume size */ + if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + sbi->alloc_mode = ALLOC_MODE_REUSE; + sm_i->dcc_info->discard_granularity = 1; + sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; + } +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -2875,6 +2887,8 @@ skip_recovery: f2fs_join_shrinker(sbi); + f2fs_tuning_parameters(sbi); + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); From a5052f32b940d492403a8a6624ce88094bfb610e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 25 Feb 2018 01:04:57 -0800 Subject: [PATCH 635/804] f2fs: set readdir_ra by default It gives general readdir improvement. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 28c49fc34e86..85e4b938b996 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1298,6 +1298,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; sbi->whint_mode = WHINT_MODE_OFF; sbi->alloc_mode = ALLOC_MODE_DEFAULT; + sbi->readdir_ra = 1; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); From 10b2d001d6ace7f509bda9321a729b6949cc6ea0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 22 Feb 2018 23:30:55 -0800 Subject: [PATCH 636/804] f2fs: issue discard aggressively in the gc_urgent mode This patch avoids to skip discard commands when user sets gc_urgent mode. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index da498a1de469..c217a91088af 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1491,12 +1491,11 @@ static int issue_discard_thread(void *data) if (kthread_should_stop()) return 0; - if (dcc->discard_wake) { + if (dcc->discard_wake) dcc->discard_wake = 0; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, - DPOLICY_FORCE, 1); - } + + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + init_discard_policy(&dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); @@ -1807,7 +1806,7 @@ void init_discard_policy(struct discard_policy *dpolicy, } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; + dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { From 1e0aeb0af9ed3b16b4c2543aa2c6502a153b897b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 26 Feb 2018 
15:40:30 -0800 Subject: [PATCH 637/804] f2fs: do gc in greedy mode for whole range if gc_urgent mode is set Otherwise, f2fs conducts GC on 8GB range only based on slow cost-benefit. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7725999394b0..54f51a990794 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -162,12 +162,17 @@ static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) { int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; - if (gc_th && gc_th->gc_idle) { + if (!gc_th) + return gc_mode; + + if (gc_th->gc_idle) { if (gc_th->gc_idle == 1) gc_mode = GC_CB; else if (gc_th->gc_idle == 2) gc_mode = GC_GREEDY; } + if (gc_th->gc_urgent) + gc_mode = GC_GREEDY; return gc_mode; } @@ -189,7 +194,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, } /* we need to check every dirty segments in the FG_GC case */ - if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) + if (gc_type != FG_GC && + (sbi->gc_thread && !sbi->gc_thread->gc_urgent) && + p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first in no_heap mode*/ From 58edcdbca67ab09ef7631e7a94a5bd5190895631 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 27 Feb 2018 22:45:24 +0800 Subject: [PATCH 638/804] f2fs: fix to avoid race in between atomic write and background GC Sqlite user Background GC - move_data_block : move page #1 - f2fs_is_atomic_file - f2fs_ioc_start_atomic_write - f2fs_ioc_commit_atomic_write - commit_inmem_pages : commit page #1 & set node #2 dirty - f2fs_submit_page_write - f2fs_update_data_blkaddr - set_page_dirty : set node #2 dirty - f2fs_do_sync_file - fsync_node_pages : commit node #1 & node #2, then sudden power-cut In a race case, we may check FI_ATOMIC_FILE flag before starting atomic write flow, then we will commit meta data before data with 
reversed order, after a sudden power-cut, database transaction will be inconsistent. So we'd better make gc and atomic_write mutually exclusive by using a lock instead of flag checking. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e418fc5b3fed..8ec080550a37 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1713,6 +1713,8 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + if (f2fs_is_volatile_file(inode)) goto err_out; @@ -1731,6 +1733,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; From 076a6f32fe5d2d8c43f44e625c67d796eeb8f1ed Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 28 Feb 2018 17:07:27 +0800 Subject: [PATCH 639/804] f2fs: support hot file extension This patch supports recognizing hot file extensions in f2fs, so that we can allocate proper hot segment location for its data, which can lead to better hot/cold separation in the filesystem.
In addition, we change the query/add/del operation methods for the extension_list sysfs entry as below: - Query: cat /sys/fs/f2fs//extension_list - Add: echo 'extension' > /sys/fs/f2fs//extension_list - Del: echo '!extension' > /sys/fs/f2fs//extension_list - Add: echo '[h/c]extension' > /sys/fs/f2fs//extension_list - Del: echo '[h/c]!extension' > /sys/fs/f2fs//extension_list - [h] means add/del hot file extension - [c] means add/del cold file extension Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 +- fs/f2fs/f2fs.h | 6 +- fs/f2fs/namei.c | 77 +++++++++++++++++++------ fs/f2fs/segment.c | 3 +- fs/f2fs/sysfs.c | 30 ++++++++-- include/linux/f2fs_fs.h | 3 +- 6 files changed, 95 insertions(+), 30 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index be3f74ec05b5..b8d0a30f1644 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -199,5 +199,7 @@ Contact: "Chao Yu" Description: Used to control configure extension list: - Query: cat /sys/fs/f2fs//extension_list - - Add: echo 'extension' > /sys/fs/f2fs//extension_list - - Del: echo '!extension' > /sys/fs/f2fs//extension_list + - Add: echo '[h/c]extension' > /sys/fs/f2fs//extension_list + - Del: echo '[h/c]!extension' > /sys/fs/f2fs//extension_list + - [h] means add/del hot file extension + - [c] means add/del cold file extension diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f88986207ed..e3bfecf7852b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -640,6 +640,7 @@ enum { #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 +#define FADVISE_HOT_BIT 0x20 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -654,6 +655,9 @@ enum { #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) #define file_keep_isize(inode)
is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) +#define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) +#define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) #define DEF_DIR_LEVEL 0 @@ -2699,7 +2703,7 @@ void handle_failed_inode(struct inode *inode); * namei.c */ int update_extension_list(struct f2fs_sb_info *sbi, const char *name, - bool set); + bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 685f94ba760b..794dac1c64b3 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -142,7 +142,7 @@ fail_drop: return ERR_PTR(err); } -static int is_multimedia_file(const unsigned char *s, const char *sub) +static int is_extension_exist(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -168,33 +168,59 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, count; + int i, cold_count, hot_count; down_read(&sbi->sb_lock); - count = le32_to_cpu(sbi->raw_super->extension_count); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; - for (i = 0; i < count; i++) { - if (is_multimedia_file(name, extlist[i])) { + for (i = 0; i < cold_count + hot_count; i++) { + if (!is_extension_exist(name, extlist[i])) + continue; + if (i < cold_count) file_set_cold(inode); - break; - } + else + file_set_hot(inode); + break; } up_read(&sbi->sb_lock); } -int update_extension_list(struct f2fs_sb_info *sbi, const char 
*name, bool set) +int update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int count = le32_to_cpu(sbi->raw_super->extension_count); + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; int i; - for (i = 0; i < count; i++) { + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { if (strcmp(name, extlist[i])) continue; @@ -202,20 +228,33 @@ int update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool set) return -EINVAL; memcpy(extlist[i], extlist[i + 1], - F2FS_EXTENSION_LEN * (count - i - 1)); - memset(extlist[count - 1], 0, F2FS_EXTENSION_LEN); - sbi->raw_super->extension_count = cpu_to_le32(count - 1); + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); return 0; } if (!set) return -EINVAL; - if (count == F2FS_MAX_EXTENSION) - return -EINVAL; + if (hot) { + strncpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; - strncpy(extlist[count], name, strlen(name)); - sbi->raw_super->extension_count = cpu_to_le32(count + 1); + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + strncpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + 
sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } return 0; } @@ -239,7 +278,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_files(sbi, inode, dentry->d_name.name); + set_file_temperature(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c217a91088af..2d753f9b7499 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2667,7 +2667,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - if (is_inode_flag_set(inode, FI_HOT_DATA)) + if (file_is_hot(inode) || + is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; /* rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d27b28e602a6..23a2d8d66c43 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -139,10 +139,19 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!strcmp(a->attr.name, "extension_list")) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int count = le32_to_cpu(sbi->raw_super->extension_count); + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; int len = 0, i; - for (i = 0; i < count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, + "cold file extenstion:\n"); + for (i = 0; i < cold_count; i++) + len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + extlist[i]); + + len += snprintf(buf + len, PAGE_SIZE - len, + "hot file extenstion:\n"); + for (i = cold_count; i < cold_count + hot_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); return len; @@ -168,9 +177,18 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (!strcmp(a->attr.name, "extension_list")) { const char 
*name = strim((char *)buf); - bool set = true; + bool set = true, hot; - if (name[0] == '!') { + if (!strncmp(name, "[h]", 3)) + hot = true; + else if (!strncmp(name, "[c]", 3)) + hot = false; + else + return -EINVAL; + + name += 3; + + if (*name == '!') { name++; set = false; } @@ -180,13 +198,13 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, down_write(&sbi->sb_lock); - ret = update_extension_list(sbi, name, set); + ret = update_extension_list(sbi, name, hot, set); if (ret) goto out; ret = f2fs_commit_super(sbi, false); if (ret) - update_extension_list(sbi, name, !set); + update_extension_list(sbi, name, hot, !set); out: up_write(&sbi->sb_lock); return ret ? ret : count; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index bee1211bc2b9..bb92fd5b5841 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -111,7 +111,8 @@ struct f2fs_super_block { __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ struct f2fs_device devs[MAX_DEVICES]; /* device list */ __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ - __u8 reserved[315]; /* valid reserved region */ + __u8 hot_ext_count; /* # of hot file extension */ + __u8 reserved[314]; /* valid reserved region */ } __packed; /* From 6c6611223a79ead8030efbe3443f870c0f11540f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 8 Mar 2018 20:47:33 -0800 Subject: [PATCH 640/804] f2fs: avoid selinux denial on CAP_SYS_RESOURCE This fixes CAP_SYS_RESOURCE denial of selinux when using resgid, since it seems selinux reports it at the first place, but mostly we don't need to check this condition first. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e3bfecf7852b..3e05162bbeb7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1671,13 +1671,13 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, return false; if (IS_NOQUOTA(inode)) return true; - if (capable(CAP_SYS_RESOURCE)) - return true; if (uid_eq(sbi->s_resuid, current_fsuid())) return true; if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) return true; + if (capable(CAP_SYS_RESOURCE)) + return true; return false; } From 0c9c3e034410c4b1410fc3dad4d2657d71539ae4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 1 Mar 2018 23:40:31 +0800 Subject: [PATCH 641/804] f2fs: wrap sb_rdonly with f2fs_readonly Use f2fs_readonly to wrap sb_rdonly for cleanup, and spread it in all places. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/super.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3e05162bbeb7..be7f236a38da 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2531,8 +2531,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) return ret; } -#define sb_rdonly f2fs_readonly -static inline int f2fs_readonly(struct super_block *sb) +static inline bool f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 85e4b938b996..8db821b2d78e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -336,7 +336,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "QUOTA feature is enabled, so ignore jquota_fmt"); sbi->s_jquota_fmt = 0; } - if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, "Filesystem with quota feature cannot be mounted RDWR " "without 
CONFIG_QUOTA"); @@ -2813,7 +2813,7 @@ try_onemore: * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) { f2fs_msg(sb, KERN_ERR, @@ -2898,7 +2898,7 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA - if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif f2fs_sync_inode_meta(sbi); From 6bc490f0eedcd21df5a41e9369cdafed154c9e95 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 1 Mar 2018 23:40:32 +0800 Subject: [PATCH 642/804] f2fs: fix to restore old mount option in ->remount_fs This patch fixes to restore old mount option once we encounter failure in ->remount_fs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8db821b2d78e..dca74d62d9d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1341,6 +1341,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); int old_whint_mode = sbi->whint_mode; int old_alloc_mode = sbi->alloc_mode; + int old_inline_xattr_size = sbi->inline_xattr_size; + block_t old_root_reserved_blocks = sbi->root_reserved_blocks; + kuid_t old_resuid = sbi->s_resuid; + kgid_t old_resgid = sbi->s_resgid; + int old_write_io_size_bits = sbi->write_io_size_bits; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif @@ -1490,6 +1495,11 @@ restore_opts: sbi->s_qf_names[i] = s_qf_names[i]; } #endif + sbi->write_io_size_bits = old_write_io_size_bits; + sbi->s_resgid = old_resgid; + sbi->s_resuid = old_resuid; + sbi->root_reserved_blocks = old_root_reserved_blocks; + sbi->inline_xattr_size = old_inline_xattr_size; 
sbi->alloc_mode = old_alloc_mode; sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; From 0bdeb167c843f33fffa3bd046b9e9e9eb8ff01ea Mon Sep 17 00:00:00 2001 From: Junling Zheng Date: Wed, 7 Mar 2018 12:07:49 +0800 Subject: [PATCH 643/804] f2fs: introduce mount option for fsync mode Commit "0a007b97aad6"(f2fs: recover directory operations by fsync) fixed xfstest generic/342 case, but it also increased the written data and caused the performance degradation. In most cases, there's no need to do so heavy fsync actually. So we introduce new mount option "fsync_mode={posix,strict}" to control the policy of fsync. "fsync_mode=posix" is set by default, and means that f2fs uses a light fsync, which follows POSIX semantics. And "fsync_mode=strict" means that it's a heavy fsync, which behaves in line with xfs, ext4 and btrfs, where generic/342 will pass, but the performance will regress. Signed-off-by: Junling Zheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 7 +++++++ fs/f2fs/dir.c | 3 ++- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 3 ++- fs/f2fs/namei.c | 9 ++++++--- fs/f2fs/super.c | 26 ++++++++++++++++++++++++++ 6 files changed, 51 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 579c1119131d..fb92e6f25adf 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -180,6 +180,13 @@ whint_mode=%s Control which write hints are passed down to block passes down hints with its policy. alloc_mode=%s Adjust block allocation policy, which supports "reuse" and "default". +fsync_mode=%s Control the policy of fsync. Currently supports "posix" + and "strict". In "posix" mode, which is default, fsync + will follow POSIX semantics and does a light operation + to improve the filesystem performance. 
In "strict" mode, + fsync will be heavy and behaves in line with xfs, ext4 + and btrfs, where xfstest generic/342 will pass, but the + performance will regress. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 560b707050ca..bb3b8ef1a890 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -704,7 +704,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); - add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + if (F2FS_I_SB(dir)->fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index be7f236a38da..1ec04a58576e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1116,6 +1116,11 @@ enum { ALLOC_MODE_REUSE, /* reuse segments as much as possible */ }; +enum fsync_mode { + FSYNC_MODE_POSIX, /* fsync follows posix semantics */ + FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1305,6 +1310,9 @@ struct f2fs_sb_info { /* segment allocation policy */ int alloc_mode; + + /* fsync policy */ + int fsync_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8ec080550a37..57afbf3e09ea 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -168,7 +168,8 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; - else if (need_dentry_mark(sbi, inode->i_ino) && + else if (sbi->fsync_mode == FSYNC_MODE_STRICT && + need_dentry_mark(sbi, inode->i_ino) && exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; diff --git 
a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 794dac1c64b3..2b00eb44bb90 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -967,7 +967,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); f2fs_i_links_write(old_dir, false); } - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (sbi->fsync_mode == FSYNC_MODE_STRICT) + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -1117,8 +1118,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); - add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (sbi->fsync_mode == FSYNC_MODE_STRICT) { + add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + } f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index dca74d62d9d4..b7c3f3b18a6d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -131,6 +131,7 @@ enum { Opt_jqfmt_vfsv1, Opt_whint, Opt_alloc, + Opt_fsync, Opt_err, }; @@ -186,6 +187,7 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, + {Opt_fsync, "fsync_mode=%s"}, {Opt_err, NULL}, }; @@ -719,6 +721,22 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_fsync: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 5 && + !strncmp(name, "posix", 5)) { + sbi->fsync_mode = FSYNC_MODE_POSIX; + } else if (strlen(name) == 6 && + !strncmp(name, "strict", 6)) { + sbi->fsync_mode = FSYNC_MODE_STRICT; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -1288,6 +1306,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",alloc_mode=%s", "default"); 
else if (sbi->alloc_mode == ALLOC_MODE_REUSE) seq_printf(seq, ",alloc_mode=%s", "reuse"); + + if (sbi->fsync_mode == FSYNC_MODE_POSIX) + seq_printf(seq, ",fsync_mode=%s", "posix"); + else if (sbi->fsync_mode == FSYNC_MODE_STRICT) + seq_printf(seq, ",fsync_mode=%s", "strict"); return 0; } @@ -1298,6 +1321,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; sbi->whint_mode = WHINT_MODE_OFF; sbi->alloc_mode = ALLOC_MODE_DEFAULT; + sbi->fsync_mode = FSYNC_MODE_POSIX; sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -1341,6 +1365,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); int old_whint_mode = sbi->whint_mode; int old_alloc_mode = sbi->alloc_mode; + int old_fsync_mode = sbi->fsync_mode; int old_inline_xattr_size = sbi->inline_xattr_size; block_t old_root_reserved_blocks = sbi->root_reserved_blocks; kuid_t old_resuid = sbi->s_resuid; @@ -1501,6 +1526,7 @@ restore_opts: sbi->root_reserved_blocks = old_root_reserved_blocks; sbi->inline_xattr_size = old_inline_xattr_size; sbi->alloc_mode = old_alloc_mode; + sbi->fsync_mode = old_fsync_mode; sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; From 5738be52b3e88fab6008a95bab75548ef2f47826 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 7 Mar 2018 16:22:50 +0800 Subject: [PATCH 644/804] f2fs: Don't overwrite all types of node to keep node chain Currently, we enable node SSR by default, and mixed different types of node segment to do SSR more intensively. Although reuse warm node is not allowed, warm node chain will be destroyed by errors introduced by other types node chain. So we'd better forbid reusing all types of node to keep warm node chain. 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2d753f9b7499..92a46a7ba931 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1942,7 +1942,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ - if (se->type == CURSEG_WARM_NODE) { + if (IS_NODESEG(se->type)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } From d909e9410634d321ae6931e87bb0ad5eaac3fa62 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 8 Mar 2018 14:22:56 +0800 Subject: [PATCH 645/804] f2fs: wrap all options with f2fs_sb_info.mount_opt This patch merges miscellaneous mount options into struct f2fs_mount_info, After this patch, once we add new mount option, we don't need to worry about recovery of it in remount_fs(), since we will recover the f2fs_sb_info.mount_opt including all options. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 64 ++++++------ fs/f2fs/file.c | 4 +- fs/f2fs/namei.c | 6 +- fs/f2fs/segment.c | 8 +- fs/f2fs/super.c | 226 +++++++++++++++++++--------------------- fs/f2fs/sysfs.c | 4 +- include/linux/f2fs_fs.h | 8 +- 9 files changed, 154 insertions(+), 170 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ca848dc9dc0..1e78f55c9a7a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2302,7 +2302,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int rw = iov_iter_rw(iter); int err; enum rw_hint hint = iocb->ki_hint; - int whint_mode = sbi->whint_mode; + int whint_mode = F2FS_OPTION(sbi).whint_mode; err = check_direct_IO(inode, iter, offset); if (err) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bb3b8ef1a890..02c32c96fe09 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -704,7 +704,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); - if (F2FS_I_SB(dir)->fsync_mode == FSYNC_MODE_STRICT) + if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ec04a58576e..fa93ef53be34 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -99,9 +99,10 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 -#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) +#define F2FS_OPTION(sbi) ((sbi)->mount_opt) +#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option) +#define 
test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -114,7 +115,25 @@ typedef u32 block_t; /* typedef u32 nid_t; struct f2fs_mount_info { - unsigned int opt; + unsigned int opt; + int write_io_size_bits; /* Write IO size bits */ + block_t root_reserved_blocks; /* root reserved blocks */ + kuid_t s_resuid; /* reserved blocks for uid */ + kgid_t s_resgid; /* reserved blocks for gid */ + int active_logs; /* # of active logs */ + int inline_xattr_size; /* inline xattr size */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; /* For fault injection */ +#endif +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif + /* For which write hints are passed down to block layer */ + int whint_mode; + int alloc_mode; /* segment allocation policy */ + int fsync_mode; /* fsync policy */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -1145,7 +1164,6 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; /* bio ordering for NODE/DATA */ - int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ @@ -1195,9 +1213,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ - int active_logs; /* # of active logs */ int dir_level; /* directory level */ - int inline_xattr_size; /* inline xattr size */ unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ int readdir_ra; /* readahead inode in readdir */ @@ -1207,9 +1223,6 @@ struct f2fs_sb_info { block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved 
blocks */ block_t current_reserved_blocks; /* current reserved blocks */ - block_t root_reserved_blocks; /* root reserved blocks */ - kuid_t s_resuid; /* reserved blocks for uid */ - kgid_t s_resgid; /* reserved blocks for gid */ unsigned int nquota_files; /* # of quota sysfile */ @@ -1294,25 +1307,6 @@ struct f2fs_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; - - /* For fault injection */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info fault_info; -#endif - -#ifdef CONFIG_QUOTA - /* Names of quota files with journalled quota */ - char *s_qf_names[MAXQUOTAS]; - int s_jquota_fmt; /* Format of quota to use */ -#endif - /* For which write hints are passed down to block layer */ - int whint_mode; - - /* segment allocation policy */ - int alloc_mode; - - /* fsync policy */ - int fsync_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1322,7 +1316,7 @@ struct f2fs_sb_info { __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { - struct f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (!ffi->inject_rate) return false; @@ -1679,10 +1673,10 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, return false; if (IS_NOQUOTA(inode)) return true; - if (uid_eq(sbi->s_resuid, current_fsuid())) + if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) return true; - if (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && - in_group_p(sbi->s_resgid)) + if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && + in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; if (capable(CAP_SYS_RESOURCE)) return true; @@ -1720,7 +1714,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks; if (!__allow_reserved_blocks(sbi, inode)) - avail_user_block_count -= sbi->root_reserved_blocks; + avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if 
(unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; @@ -1927,7 +1921,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks + 1; if (!__allow_reserved_blocks(sbi, inode)) - valid_block_count += sbi->root_reserved_blocks; + valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 57afbf3e09ea..e39edd76e170 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -166,9 +166,9 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) cp_reason = CP_FASTBOOT_MODE; - else if (sbi->active_logs == 2) + else if (F2FS_OPTION(sbi).active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; - else if (sbi->fsync_mode == FSYNC_MODE_STRICT && + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && need_dentry_mark(sbi, inode->i_ino) && exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2b00eb44bb90..62aec95fe124 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -97,7 +97,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); if (f2fs_has_inline_xattr(inode)) - xattr_size = sbi->inline_xattr_size; + xattr_size = F2FS_OPTION(sbi).inline_xattr_size; /* Otherwise, will be 0 */ } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -967,7 +967,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); f2fs_i_links_write(old_dir, false); } - if (sbi->fsync_mode == FSYNC_MODE_STRICT) + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) add_ino_entry(sbi, 
new_dir->i_ino, TRANS_DIR_INO); f2fs_unlock_op(sbi); @@ -1118,7 +1118,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_mark_inode_dirty_sync(new_dir, false); - if (sbi->fsync_mode == FSYNC_MODE_STRICT) { + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 92a46a7ba931..3389721893d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2251,7 +2251,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; /* find segments from 0 to reuse freed segments */ - if (sbi->alloc_mode == ALLOC_MODE_REUSE) + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; return CURSEG_I(sbi, type)->segno; @@ -2604,7 +2604,7 @@ int rw_hint_to_seg_type(enum rw_hint hint) enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { - if (sbi->whint_mode == WHINT_MODE_USER) { + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { if (type == DATA) { if (temp == WARM) return WRITE_LIFE_NOT_SET; @@ -2615,7 +2615,7 @@ enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, } else { return WRITE_LIFE_NOT_SET; } - } else if (sbi->whint_mode == WHINT_MODE_FS) { + } else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) { if (type == DATA) { if (temp == WARM) return WRITE_LIFE_LONG; @@ -2684,7 +2684,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) { int type = 0; - switch (fio->sbi->active_logs) { + switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b7c3f3b18a6d..67b0e1e34da0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -60,7 +60,7 @@ char *fault_name[FAULT_MAX] = { static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate) { - struct 
f2fs_fault_info *ffi = &sbi->fault_info; + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); @@ -208,21 +208,24 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi) block_t limit = (sbi->user_block_count << 1) / 1000; /* limit is 0.2% */ - if (test_opt(sbi, RESERVE_ROOT) && sbi->root_reserved_blocks > limit) { - sbi->root_reserved_blocks = limit; + if (test_opt(sbi, RESERVE_ROOT) && + F2FS_OPTION(sbi).root_reserved_blocks > limit) { + F2FS_OPTION(sbi).root_reserved_blocks = limit; f2fs_msg(sbi->sb, KERN_INFO, "Reduce reserved blocks for root = %u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } if (!test_opt(sbi, RESERVE_ROOT) && - (!uid_eq(sbi->s_resuid, + (!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || - !gid_eq(sbi->s_resgid, + !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) f2fs_msg(sbi->sb, KERN_INFO, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); } static void init_once(void *foo) @@ -242,7 +245,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, char *qname; int ret = -EINVAL; - if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); @@ -260,8 +263,8 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "Not enough memory for storing quotafile name"); return -EINVAL; } - if (sbi->s_qf_names[qtype]) { - if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + if (F2FS_OPTION(sbi).s_qf_names[qtype]) { + if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) ret = 0; else f2fs_msg(sb, 
KERN_ERR, @@ -274,7 +277,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quotafile must be on filesystem root"); goto errout; } - sbi->s_qf_names[qtype] = qname; + F2FS_OPTION(sbi).s_qf_names[qtype] = qname; set_opt(sbi, QUOTA); return 0; errout: @@ -286,13 +289,13 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); return -EINVAL; } - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; + kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -308,15 +311,19 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "Cannot enable project quota enforcement."); return -1; } - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || - sbi->s_qf_names[PRJQUOTA]) { - if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) clear_opt(sbi, USRQUOTA); - if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + if (test_opt(sbi, GRPQUOTA) && + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) clear_opt(sbi, GRPQUOTA); - if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + if (test_opt(sbi, PRJQUOTA) && + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) clear_opt(sbi, PRJQUOTA); if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || @@ -326,17 +333,17 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } - if (!sbi->s_jquota_fmt) { + if (!F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " "not specified"); return -1; } } - if (f2fs_sb_has_quota_ino(sbi->sb) 
&& sbi->s_jquota_fmt) { + if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_INFO, "QUOTA feature is enabled, so ignore jquota_fmt"); - sbi->s_jquota_fmt = 0; + F2FS_OPTION(sbi).s_jquota_fmt = 0; } if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, @@ -446,7 +453,7 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; set_opt(sbi, INLINE_XATTR_SIZE); - sbi->inline_xattr_size = arg; + F2FS_OPTION(sbi).inline_xattr_size = arg; break; #else case Opt_user_xattr: @@ -486,7 +493,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) return -EINVAL; - sbi->active_logs = arg; + F2FS_OPTION(sbi).active_logs = arg; break; case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); @@ -530,9 +537,9 @@ static int parse_options(struct super_block *sb, char *options) if (test_opt(sbi, RESERVE_ROOT)) { f2fs_msg(sb, KERN_INFO, "Preserve previous reserve_root=%u", - sbi->root_reserved_blocks); + F2FS_OPTION(sbi).root_reserved_blocks); } else { - sbi->root_reserved_blocks = arg; + F2FS_OPTION(sbi).root_reserved_blocks = arg; set_opt(sbi, RESERVE_ROOT); } break; @@ -545,7 +552,7 @@ static int parse_options(struct super_block *sb, char *options) "Invalid uid value %d", arg); return -EINVAL; } - sbi->s_resuid = uid; + F2FS_OPTION(sbi).s_resuid = uid; break; case Opt_resgid: if (args->from && match_int(args, &arg)) @@ -556,7 +563,7 @@ static int parse_options(struct super_block *sb, char *options) "Invalid gid value %d", arg); return -EINVAL; } - sbi->s_resgid = gid; + F2FS_OPTION(sbi).s_resgid = gid; break; case Opt_mode: name = match_strdup(&args[0]); @@ -591,7 +598,7 @@ static int parse_options(struct super_block *sb, char *options) 1 << arg, BIO_MAX_PAGES); return -EINVAL; } - sbi->write_io_size_bits = arg; + F2FS_OPTION(sbi).write_io_size_bits 
= arg; break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) @@ -652,13 +659,13 @@ static int parse_options(struct super_block *sb, char *options) return ret; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD; break; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0; break; case Opt_jqfmt_vfsv1: - sbi->s_jquota_fmt = QFMT_VFS_V1; + F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1; break; case Opt_noquota: clear_opt(sbi, QUOTA); @@ -691,13 +698,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 10 && !strncmp(name, "user-based", 10)) { - sbi->whint_mode = WHINT_MODE_USER; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_USER; } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { - sbi->whint_mode = WHINT_MODE_OFF; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; } else if (strlen(name) == 8 && !strncmp(name, "fs-based", 8)) { - sbi->whint_mode = WHINT_MODE_FS; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; } else { kfree(name); return -EINVAL; @@ -711,10 +718,10 @@ static int parse_options(struct super_block *sb, char *options) if (strlen(name) == 7 && !strncmp(name, "default", 7)) { - sbi->alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; } else if (strlen(name) == 5 && !strncmp(name, "reuse", 5)) { - sbi->alloc_mode = ALLOC_MODE_REUSE; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { kfree(name); return -EINVAL; @@ -727,10 +734,10 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 5 && !strncmp(name, "posix", 5)) { - sbi->fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; } else if (strlen(name) == 6 && !strncmp(name, "strict", 6)) { - sbi->fsync_mode = FSYNC_MODE_STRICT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; } else { kfree(name); return 
-EINVAL; @@ -770,8 +777,9 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!sbi->inline_xattr_size || - sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + if (!F2FS_OPTION(sbi).inline_xattr_size || + F2FS_OPTION(sbi).inline_xattr_size >= + DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) { @@ -784,8 +792,8 @@ static int parse_options(struct super_block *sb, char *options) /* Not pass down write hints if the number of active logs is lesser * than NR_CURSEG_TYPE. */ - if (sbi->active_logs != NR_CURSEG_TYPE) - sbi->whint_mode = WHINT_MODE_OFF; + if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; return 0; } @@ -1028,7 +1036,7 @@ static void f2fs_put_super(struct super_block *sb) mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) @@ -1142,8 +1150,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - if (buf->f_bfree > sbi->root_reserved_blocks) - buf->f_bavail = buf->f_bfree - sbi->root_reserved_blocks; + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) + buf->f_bavail = buf->f_bfree - + F2FS_OPTION(sbi).root_reserved_blocks; else buf->f_bavail = 0; @@ -1178,10 +1187,10 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #ifdef CONFIG_QUOTA struct f2fs_sb_info *sbi = F2FS_SB(sb); - if (sbi->s_jquota_fmt) { + if (F2FS_OPTION(sbi).s_jquota_fmt) { char *fmtname = ""; - switch (sbi->s_jquota_fmt) { + switch (F2FS_OPTION(sbi).s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; @@ -1195,14 +1204,17 @@ static inline void 
f2fs_show_quota_options(struct seq_file *seq, seq_printf(seq, ",jqfmt=%s", fmtname); } - if (sbi->s_qf_names[USRQUOTA]) - seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", + F2FS_OPTION(sbi).s_qf_names[USRQUOTA]); - if (sbi->s_qf_names[GRPQUOTA]) - seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA]); - if (sbi->s_qf_names[PRJQUOTA]) - seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); + if (F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]); #endif } @@ -1237,7 +1249,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_xattr"); if (test_opt(sbi, INLINE_XATTR_SIZE)) seq_printf(seq, ",inline_xattr_size=%u", - sbi->inline_xattr_size); + F2FS_OPTION(sbi).inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1273,18 +1285,20 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); - seq_printf(seq, ",active_logs=%u", sbi->active_logs); + seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", - sbi->root_reserved_blocks, - from_kuid_munged(&init_user_ns, sbi->s_resuid), - from_kgid_munged(&init_user_ns, sbi->s_resgid)); + F2FS_OPTION(sbi).root_reserved_blocks, + from_kuid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resuid), + from_kgid_munged(&init_user_ns, + F2FS_OPTION(sbi).s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) seq_printf(seq, ",fault_injection=%u", - 
sbi->fault_info.inject_rate); + F2FS_OPTION(sbi).fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) @@ -1297,19 +1311,19 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); - if (sbi->whint_mode == WHINT_MODE_USER) + if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) seq_printf(seq, ",whint_mode=%s", "user-based"); - else if (sbi->whint_mode == WHINT_MODE_FS) + else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); - if (sbi->alloc_mode == ALLOC_MODE_DEFAULT) + if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); - else if (sbi->alloc_mode == ALLOC_MODE_REUSE) + else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) seq_printf(seq, ",alloc_mode=%s", "reuse"); - if (sbi->fsync_mode == FSYNC_MODE_POSIX) + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); - else if (sbi->fsync_mode == FSYNC_MODE_STRICT) + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) seq_printf(seq, ",fsync_mode=%s", "strict"); return 0; } @@ -1317,11 +1331,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - sbi->active_logs = NR_CURSEG_TYPE; - sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - sbi->whint_mode = WHINT_MODE_OFF; - sbi->alloc_mode = ALLOC_MODE_DEFAULT; - sbi->fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).active_logs = NR_CURSEG_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -1359,24 +1373,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct 
f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; - int err, active_logs; + int err; bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - int old_whint_mode = sbi->whint_mode; - int old_alloc_mode = sbi->alloc_mode; - int old_fsync_mode = sbi->fsync_mode; - int old_inline_xattr_size = sbi->inline_xattr_size; - block_t old_root_reserved_blocks = sbi->root_reserved_blocks; - kuid_t old_resuid = sbi->s_resuid; - kgid_t old_resgid = sbi->s_resgid; - int old_write_io_size_bits = sbi->write_io_size_bits; -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct f2fs_fault_info ffi = sbi->fault_info; -#endif #ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; int i, j; #endif @@ -1386,21 +1387,21 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; - active_logs = sbi->active_logs; #ifdef CONFIG_QUOTA - s_jquota_fmt = sbi->s_jquota_fmt; + org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { - s_qf_names[i] = kstrdup(sbi->s_qf_names[i], - GFP_KERNEL); - if (!s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { + org_mount_opt.s_qf_names[i] = + kstrdup(F2FS_OPTION(sbi).s_qf_names[i], + GFP_KERNEL); + if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kfree(s_qf_names[j]); + kfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { - s_qf_names[i] = NULL; + org_mount_opt.s_qf_names[i] = NULL; } } #endif @@ -1470,7 +1471,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY || sbi->whint_mode != old_whint_mode) { + if (*flags & MS_RDONLY || + F2FS_OPTION(sbi).whint_mode != org_mount_opt.whint_mode) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1496,7 +1498,7 @@ skip: #ifdef CONFIG_QUOTA /* Release old 
quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kfree(s_qf_names[i]); + kfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -1514,26 +1516,14 @@ restore_gc: } restore_opts: #ifdef CONFIG_QUOTA - sbi->s_jquota_fmt = s_jquota_fmt; + F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = s_qf_names[i]; + kfree(F2FS_OPTION(sbi).s_qf_names[i]); + F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif - sbi->write_io_size_bits = old_write_io_size_bits; - sbi->s_resgid = old_resgid; - sbi->s_resuid = old_resuid; - sbi->root_reserved_blocks = old_root_reserved_blocks; - sbi->inline_xattr_size = old_inline_xattr_size; - sbi->alloc_mode = old_alloc_mode; - sbi->fsync_mode = old_fsync_mode; - sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; - sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; -#ifdef CONFIG_F2FS_FAULT_INJECTION - sbi->fault_info = ffi; -#endif return err; } @@ -1655,8 +1645,8 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) { - return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], - sbi->s_jquota_fmt, type); + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], + F2FS_OPTION(sbi).s_jquota_fmt, type); } int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) @@ -1675,7 +1665,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) } for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i]) { + if (F2FS_OPTION(sbi).s_qf_names[i]) { err = f2fs_quota_on_mount(sbi, i); if (!err) { enabled = 1; @@ -2566,7 +2556,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - sbi->alloc_mode = ALLOC_MODE_REUSE; + 
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } @@ -2619,8 +2609,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; - sbi->s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) @@ -2978,7 +2968,7 @@ free_bio_info: free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); + kfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif kfree(options); free_sb_buf: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 23a2d8d66c43..7d983ad19da4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -58,7 +58,7 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; + return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif return NULL; } @@ -222,7 +222,7 @@ out: if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - sbi->root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index bb92fd5b5841..61ddee120675 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -39,10 +39,10 @@ #define F2FS_MAX_QUOTAS 3 -#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ -#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ -#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ -#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* 
power of 2 */ +#define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */ #define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) /* This flag is used by node and meta inodes, and by recovery */ From 30654507e0a28a634e709f7fa05dd3850067bd32 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 8 Mar 2018 19:34:38 +0900 Subject: [PATCH 646/804] f2fs: add nowait aio support This patch adds nowait aio support[1]. Return EAGAIN if any of the following checks fail for direct I/O: - i_rwsem is not lockable - Blocks are not allocated at the write location With this patch, xfstests generic/471 passes. [1]: 6be96d "Introduce RWF_NOWAIT and FMODE_AIO_NOWAIT" Signed-off-by: Hyunchul Lee Reviewed-by: Goldwyn Rodrigues Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 47 ++++++++++++++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 35 ++++++++++++++++++++++++++++------ include/linux/fs.h | 4 ++++ 4 files changed, 78 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1e78f55c9a7a..b66b78d3f76d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -837,13 +837,6 @@ alloc: return 0; } -static inline bool __force_buffered_io(struct inode *inode, int rw) -{ - return (f2fs_encrypted_file(inode) || - (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); -} - int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -875,7 +868,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (direct_io) { map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); - flag = __force_buffered_io(inode, WRITE) ?
+ flag = f2fs_force_buffered_io(inode, WRITE) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; goto map_blocks; @@ -1119,6 +1112,31 @@ out: return err; } +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) +{ + struct f2fs_map_blocks map; + block_t last_lblk; + int err; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = F2FS_BYTES_TO_BLK(pos); + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + last_lblk = F2FS_BLK_ALIGN(pos + len); + + while (map.m_lblk < last_lblk) { + map.m_len = last_lblk - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); + if (err || map.m_len == 0) + return false; + map.m_lblk += map.m_len; + } + return true; +} + static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, pgoff_t *next_pgofs, int seg_type) @@ -2308,7 +2326,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (err) return err; - if (__force_buffered_io(inode, rw)) + if (f2fs_force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); @@ -2316,7 +2334,15 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - down_read(&F2FS_I(inode)->dio_rwsem[rw]); + if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) { + if (iocb->ki_flags & IOCB_NOWAIT) { + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + down_read(&F2FS_I(inode)->dio_rwsem[rw]); + } + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); up_read(&F2FS_I(inode)->dio_rwsem[rw]); @@ -2332,6 +2358,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } } +out: trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa93ef53be34..615f158f895b 100644 --- a/fs/f2fs/f2fs.h +++ 
b/fs/f2fs/f2fs.h @@ -2961,6 +2961,7 @@ int f2fs_release_page(struct page *page, gfp_t wait); int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif +bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); /* * gc.c @@ -3345,4 +3346,11 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } +static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +{ + return (f2fs_encrypted_file(inode) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e39edd76e170..cdad5853aaff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -482,6 +482,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + + filp->f_mode |= FMODE_NOWAIT; + return dquot_file_open(inode, filp); } @@ -2696,7 +2699,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + ret = generic_write_checks(iocb, from); if (ret > 0) { int err; @@ -2704,11 +2715,23 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iov_iter_fault_in_readable(from, iov_iter_count(from))) set_inode_flag(inode, FI_NO_PREALLOC); - err = f2fs_preallocate_blocks(iocb, from); - if (err) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - return err; + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + if (!f2fs_overwrite_io(inode, iocb->ki_pos, + iov_iter_count(from)) || + f2fs_has_inline_data(inode) || + f2fs_force_buffered_io(inode, WRITE)) { + inode_unlock(inode); + return -EAGAIN; + } + + } else { + err = 
f2fs_preallocate_blocks(iocb, from); + if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + return err; + } } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8231cdc25901..e9382296305d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -144,6 +144,9 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate); /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) +/* File is capable of returning -EAGAIN if I/O will block */ +#define FMODE_NOWAIT ((__force fmode_t)0x8000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are @@ -334,6 +337,7 @@ enum rw_hint { #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) +#define IOCB_NOWAIT (1 << 7) struct kiocb { struct file *ki_filp; From 4c55abe4f8d2ca91987cf5e91e8eb7a71b2dab9c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 9 Mar 2018 14:24:22 +0800 Subject: [PATCH 647/804] f2fs: remove unneeded set_cold_node() When setting COLD_BIT_SHIFT flag in node block, we only need to call set_cold_node() in new_node_page() and recover_inode_page() during node page initialization. So remove unneeded set_cold_node() in other places. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 -- fs/f2fs/inode.c | 1 - fs/f2fs/node.c | 3 ++- fs/f2fs/node.h | 4 ++-- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 02c32c96fe09..00ada49c7fa4 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -396,8 +396,6 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; - - set_cold_node(inode, page); } if (new_name) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 10be247ca421..562a56bc037c 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -439,7 +439,6 @@ void update_inode(struct inode *inode, struct page *node_page) } __set_inode_rdev(inode, ri); - set_cold_node(inode, node_page); /* deleted inode */ if (inode->i_nlink == 0) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c294d0feea08..ab2595842c5d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1076,7 +1076,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(dn->inode, page); + set_cold_node(page, S_ISDIR(dn->inode->i_mode)); if (!PageUptodate(page)) SetPageUptodate(page); if (set_page_dirty(page)) @@ -2313,6 +2313,7 @@ retry: if (!PageUptodate(ipage)) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); + set_cold_node(page, false); src = F2FS_INODE(page); dst = F2FS_INODE(ipage); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 081ef0d672bf..e593b4d78be2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -422,12 +422,12 @@ static inline void clear_inline_node(struct page *page) ClearPageChecked(page); } -static inline void set_cold_node(struct inode *inode, struct page *page) +static inline void set_cold_node(struct page *page, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = 
le32_to_cpu(rn->footer.flag); - if (S_ISDIR(inode->i_mode)) + if (is_dir) flag &= ~(0x1 << COLD_BIT_SHIFT); else flag |= (0x1 << COLD_BIT_SHIFT); From 739ace131cdfd5dd0eca4c4bbf06b0a3bce25d9d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 9 Mar 2018 17:42:28 -0800 Subject: [PATCH 648/804] f2fs: align memory boundary for bitops For example, in arm64, free_nid_bitmap should be aligned to word size in order to use bit operations. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 20 +++++++++++++++++--- include/linux/f2fs_fs.h | 4 ++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 615f158f895b..93b13c50af67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -830,7 +830,7 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ - unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char **free_nid_bitmap; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ab2595842c5d..571cb70c5fbd 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2730,12 +2730,20 @@ static int init_node_manager(struct f2fs_sb_info *sbi) static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); + int i; - nm_i->free_nid_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks * - NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks * + sizeof(unsigned char *), GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; + for (i = 0; i < nm_i->nat_blocks; i++) { + nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, + NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + } + nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8, 
GFP_KERNEL); if (!nm_i->nat_block_bitmap) @@ -2826,7 +2834,13 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kvfree(nm_i->nat_block_bitmap); - kvfree(nm_i->free_nid_bitmap); + if (nm_i->free_nid_bitmap) { + int i; + + for (i = 0; i < nm_i->nat_blocks; i++) + kvfree(nm_i->free_nid_bitmap[i]); + kfree(nm_i->free_nid_bitmap); + } kvfree(nm_i->free_nid_count); kfree(nm_i->nat_bitmap); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 61ddee120675..2ebfa01b7091 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -305,6 +305,10 @@ struct f2fs_node { */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) #define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) +#define NAT_ENTRY_BITMAP_SIZE_ALIGNED \ + ((NAT_ENTRY_BITMAP_SIZE + BITS_PER_LONG - 1) / \ + BITS_PER_LONG * BITS_PER_LONG) + struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ From 8a5719615847c523b9975bb68294aa9a792d94ba Mon Sep 17 00:00:00 2001 From: Qiuyang Sun Date: Tue, 13 Mar 2018 19:42:50 +0800 Subject: [PATCH 649/804] f2fs: release locks before return in f2fs_ioc_gc_range() Currently, we will leave the kernel with locks still held when the gc_range is invalid. This patch fixes the bug. 
Signed-off-by: Qiuyang Sun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cdad5853aaff..abc3db46cb1d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2057,8 +2057,10 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) return ret; end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) - return -EINVAL; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + ret = -EINVAL; + goto out; + } do_more: if (!range.sync) { if (!mutex_trylock(&sbi->gc_mutex)) { From 9321e22c038cf725ad1734b42dffb2536e920242 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 15 Mar 2018 18:51:41 +0800 Subject: [PATCH 650/804] f2fs: introduce F2FS_FEATURE_LOST_FOUND feature This patch introduces a new feature, F2FS_FEATURE_LOST_FOUND, which is set by mkfs. mkfs creates a directory named lost+found, which saves unreachable files. If fsck finds a file which has no parent, or its parent is removed by fsck, the file will be placed under the lost+found directory by fsck. The lost+found directory cannot be encrypted. As a result, the root directory cannot be encrypted either. So if the LOST_FOUND feature is enabled, let's avoid encrypting the root directory.
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 12 ++++++++++++ fs/f2fs/sysfs.c | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 93b13c50af67..e0bf6f83dd14 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -145,6 +145,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 +#define F2FS_FEATURE_LOST_FOUND 0x0200 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -3298,6 +3299,7 @@ F2FS_FEATURE_FUNCS(inode_chksum, INODE_CHKSUM); F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); +F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 67b0e1e34da0..ec4774942a48 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1895,6 +1895,18 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * Encrypting the root directory is not allowed because fsck + * expects lost+found directory to exist and remain unencrypted + * if LOST_FOUND feature is enabled. 
+ * + */ + if (f2fs_sb_has_lost_found(sbi->sb) && + inode->i_ino == F2FS_ROOT_INO(sbi)) + return -EPERM; + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, fs_data, XATTR_CREATE); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 7d983ad19da4..f33a56d6e6dd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -116,6 +116,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_crtime(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); + if (f2fs_sb_has_lost_found(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "lost_found"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -292,6 +295,7 @@ enum feat_id { FEAT_FLEXIBLE_INLINE_XATTR, FEAT_QUOTA_INO, FEAT_INODE_CRTIME, + FEAT_LOST_FOUND, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -307,6 +311,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_FLEXIBLE_INLINE_XATTR: case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: + case FEAT_LOST_FOUND: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -386,6 +391,7 @@ F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); +F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -441,6 +447,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(flexible_inline_xattr), ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), + ATTR_LIST(lost_found), NULL, }; From 7419dcb8be0282e165f676539babeec2766bb0ca Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 15 Mar 2018 18:51:42 +0800 Subject: [PATCH 651/804] f2fs: introduce a new mount option test_dummy_encryption This patch introduces a new mount option 
`test_dummy_encryption' to allow fscrypt to create a fake fscrypt context. This is used by xfstests. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ fs/f2fs/dir.c | 4 +++- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/namei.c | 9 ++++++--- fs/f2fs/super.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index fb92e6f25adf..1f52baea2f69 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -187,6 +187,8 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix" fsync will be heavy and behaves in line with xfs, ext4 and btrfs, where xfstest generic/342 will pass, but the performance will regress. +test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt + context. The fake fscrypt context is used by xfstests. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 00ada49c7fa4..41d32171bd52 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -361,6 +361,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, struct page *dpage) { struct page *page; + int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -387,7 +388,8 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { + if ((f2fs_encrypted_inode(dir) || dummy_encrypt) && + f2fs_may_encrypt(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e0bf6f83dd14..423603d3f5b3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -134,6 +134,7 @@ struct f2fs_mount_info { int whint_mode; int alloc_mode; /* 
segment allocation policy */ int fsync_mode; /* fsync policy */ + bool test_dummy_encryption; /* test dummy encryption */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -1141,6 +1142,13 @@ enum fsync_mode { FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) \ + (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 62aec95fe124..5ec20f077629 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -78,7 +78,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. */ - if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) + if ((f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && + f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi->sb)) { @@ -784,10 +785,12 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + if (unlikely(f2fs_cp_error(sbi))) return -EIO; - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { int err = fscrypt_get_encryption_info(dir); if (err) return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ec4774942a48..62f228478849 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -132,6 +132,7 @@ enum { Opt_whint, Opt_alloc, Opt_fsync, + Opt_test_dummy_encryption, Opt_err, }; @@ -188,6 +189,7 @@ static match_table_t f2fs_tokens = { {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_err, 
NULL}, }; @@ -744,6 +746,21 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (!f2fs_sb_has_encrypt(sb)) { + f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); + return -EINVAL; + } + + F2FS_OPTION(sbi).test_dummy_encryption = true; + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mode enabled"); +#else + f2fs_msg(sb, KERN_INFO, + "Test dummy encryption mount option ignored"); +#endif + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -1315,6 +1332,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_OPTION(sbi).test_dummy_encryption) + seq_puts(seq, ",test_dummy_encryption"); +#endif if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1336,6 +1357,7 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; + F2FS_OPTION(sbi).test_dummy_encryption = false; sbi->readdir_ra = 1; set_opt(sbi, BG_GC); @@ -1912,6 +1934,11 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } +static bool f2fs_dummy_context(struct inode *inode) +{ + return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); +} + static unsigned f2fs_max_namelen(struct inode *inode) { return S_ISLNK(inode->i_mode) ? 
@@ -1922,6 +1949,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, + .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; From 6ab573a9d96f7991927948ecb481c89654d4bdd0 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Fri, 16 Mar 2018 18:53:53 +0530 Subject: [PATCH 652/804] f2fs: Set GF_NOFS in read_cache_page_gfp while doing f2fs_quota_read The quota code itself serializes its operations by taking a mutex. It seems the deadlock below can happen if GFP_NOFS is not used in f2fs_quota_read: __switch_to+0x88 __schedule+0x5b0 schedule+0x78 schedule_preempt_disabled+0x20 __mutex_lock_slowpath+0xdc //mutex owner is itself mutex_lock+0x2c dquot_commit+0x30 //mutex_lock(&dqopt->dqio_mutex); dqput+0xe0 __dquot_drop+0x80 dquot_drop+0x48 f2fs_evict_inode+0x218 evict+0xa8 dispose_list+0x3c prune_icache_sb+0x58 super_cache_scan+0xf4 do_shrink_slab+0x208 shrink_slab.part.40+0xac shrink_zone+0x1b0 do_try_to_free_pages+0x25c try_to_free_pages+0x164 __alloc_pages_nodemask+0x534 do_read_cache_page+0x6c read_cache_page+0x14 f2fs_quota_read+0xa4 read_blk+0x54 find_tree_dqentry+0xe4 find_tree_dqentry+0xb8 find_tree_dqentry+0xb8 find_tree_dqentry+0xb8 qtree_read_dquot+0x68 v2_read_dquot+0x24 dquot_acquire+0x5c // mutex_lock(&dqopt->dqio_mutex); dqget+0x238 __dquot_initialize+0xd4 dquot_initialize+0x10 dquot_file_open+0x34 f2fs_file_open+0x6c do_dentry_open+0x1e4 vfs_open+0x6c path_openat+0xa20 do_filp_open+0x4c do_sys_open+0x178 Signed-off-by: Ritesh Harjani Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 62f228478849..2feaf1e3fc9f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1573,7 +1573,7 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, while (toread > 0) { tocopy =
min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: - page = read_mapping_page(mapping, blkidx, NULL); + page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); From 2c8834a7a2c95b19e7242559ac4fc64c0f40916d Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 8 Mar 2018 16:29:13 +0800 Subject: [PATCH 653/804] f2fs: check blkaddr more accuratly before issue a bio This patch checks blkaddr more accurately before issuing a write or read bio. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/data.c | 5 +++-- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.h | 25 +++++++++++++++++++------ 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9db919c423b6..04c608646fd5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -68,6 +68,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, + .is_meta = is_meta, }; if (unlikely(!is_meta)) @@ -163,6 +164,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, REQ_RAHEAD, .encrypted_page = NULL, .in_list = false, + .is_meta = (type != META_POR), }; struct blk_plug plug; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b66b78d3f76d..b677300c5bac 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -381,6 +381,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) struct page *page = fio->encrypted_page ?
fio->encrypted_page : fio->page; + verify_block_addr(fio, fio->new_blkaddr); trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); @@ -426,8 +427,8 @@ next: } if (fio->old_blkaddr != NEW_ADDR) - verify_block_addr(sbi, fio->old_blkaddr); - verify_block_addr(sbi, fio->new_blkaddr); + verify_block_addr(fio, fio->old_blkaddr); + verify_block_addr(fio, fio->new_blkaddr); bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 423603d3f5b3..db5f61f821e4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1065,6 +1065,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + bool is_meta; /* indicate borrow meta inode mapping or not */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ }; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d1524d16b2a0..96a2d57ba8a4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -53,13 +53,19 @@ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ (sbi)->segs_per_sec)) \ -#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) -#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) +#define MAIN_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) +#define SEG0_BLKADDR(sbi) \ + (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) #define MAIN_SECS(sbi) ((sbi)->total_sections) -#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) +#define TOTAL_SEGS(sbi) \ + (SM_I(sbi) ? 
SM_I(sbi)->segment_count : \ + le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) #define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) @@ -632,10 +638,17 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) f2fs_bug_on(sbi, segno > TOTAL_SEGS(sbi) - 1); } -static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { - BUG_ON(blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + struct f2fs_sb_info *sbi = fio->sbi; + + if (PAGE_TYPE_OF_BIO(fio->type) == META && + (!is_read_io(fio->op) || fio->is_meta)) + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) || + blk_addr >= MAIN_BLKADDR(sbi)); + else + BUG_ON(blk_addr < MAIN_BLKADDR(sbi) || + blk_addr >= MAX_BLKADDR(sbi)); } /* From d6a69d5e656825919c0b92f50032829f55d6f8f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Mar 2018 23:08:29 +0800 Subject: [PATCH 654/804] f2fs: clean up with F2FS_BLK_ALIGN Clean up F2FS_BYTES_TO_BLK(x + F2FS_BLKSIZE - 1) with F2FS_BLK_ALIGN(x). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +-- fs/f2fs/node.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index abc3db46cb1d..161ddca86387 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -575,7 +575,6 @@ truncate_out: int truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; int count = 0, err = 0; @@ -584,7 +583,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); - free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); if (free_from >= sbi->max_file_blocks) goto free_partial; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 571cb70c5fbd..35e661890c58 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2603,8 +2603,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) if (!enabled_nat_bits(sbi, NULL)) return 0; - nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + - F2FS_BLKSIZE - 1); + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) From 49338842e9b23b7a320531b7f199e0e5266e2de4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Mar 2018 23:08:30 +0800 Subject: [PATCH 655/804] f2fs: don't track new nat entry in nat set Nat entry set is used only in checkpoint(), and during checkpoint() we won't flush new nat entry with unallocated address, so we don't need to add new nat entry into nat set, then nat_entry_set::entry_cnt can indicate actual entry count we need to flush in checkpoint(). 
Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 29 +++++++++++++++++++++++++---- fs/f2fs/node.h | 1 + 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 35e661890c58..157d768c7b31 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -193,8 +193,8 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) __free_nat_entry(e); } -static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) +static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) { nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; @@ -209,15 +209,36 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } + return head; +} + +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + struct nat_entry_set *head; + bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR; + + if (!new_ne) + head = __grab_nat_entry_set(nm_i, ne); + + /* + * update entry_cnt in below condition: + * 1. update NEW_ADDR to valid block address; + * 2. update old block address to new one; + */ + if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) || + !get_nat_flag(ne, IS_DIRTY))) + head->entry_cnt++; + + set_nat_flag(ne, IS_PREALLOC, new_ne); if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list; nm_i->dirty_nat_cnt++; - head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); refresh_list: - if (nat_get_blkaddr(ne) == NEW_ADDR) + if (new_ne) list_del_init(&ne->list); else list_move_tail(&ne->list, &head->entry_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e593b4d78be2..b95e49e4a928 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -44,6 +44,7 @@ enum { HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */ IS_DIRTY, /* this nat entry is dirty? 
*/ + IS_PREALLOC, /* nat entry is preallocated */ }; /* From 0192e0a4502f23761a844333fb878fc60ce1b029 Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Thu, 22 Mar 2018 10:08:40 +0800 Subject: [PATCH 656/804] f2fs: no need to initialize zero value for GFP_F2FS_ZERO Since f2fs_inode_info is allocated with the GFP_F2FS_ZERO flag, we do not need to zero-initialize its members any more. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2feaf1e3fc9f..a622eb4f59f2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -827,7 +827,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; - fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); @@ -839,10 +838,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); -#ifdef CONFIG_QUOTA - memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); - fi->i_reserved_quota = 0; -#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; From ee2e74b3f00e663207d7832f613e75a5df3ae3fb Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 26 Mar 2018 17:32:23 +0800 Subject: [PATCH 657/804] f2fs: Add a segment type check in inplace write This patch adds a segment type check in IPU, in case something is wrong with the blkaddr in the dnode.
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3389721893d3..d7bac60ad719 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2871,10 +2871,15 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) int rewrite_data_page(struct f2fs_io_info *fio) { int err; + struct f2fs_sb_info *sbi = fio->sbi; fio->new_blkaddr = fio->old_blkaddr; /* i/o temperature is needed for passing down write hints */ __get_segment_type(fio); + + f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi, + GET_SEGNO(sbi, fio->new_blkaddr))->type)); + stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); From db2188a68704bd120d32836bc5ac273dc26b4617 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 28 Mar 2018 11:15:09 -0700 Subject: [PATCH 658/804] f2fs: reserve bits for fs-verity Reserve an F2FS feature flag and inode flag for fs-verity. This is an in-development feature that is planned to be discussed at LSF/MM 2018 [1]. It will provide file-based integrity and authenticity for read-only files. Most code will be in a filesystem-independent module, with smaller changes needed to individual filesystems that opt-in to supporting the feature. An early prototype supporting F2FS is available [2]. Reserving the F2FS on-disk bits for fs-verity will prevent users of the prototype from conflicting with other new F2FS features. Note that we're reserving the inode flag in f2fs_inode.i_advise, which isn't really appropriate since it's not a hint or advice. But ->i_advise is already being used to hold the 'encrypt' flag; and F2FS's ->i_flags uses the generic FS_* values, so it seems ->i_flags can't be used for an F2FS-specific flag without additional work to remove the assumption that ->i_flags uses the generic flags namespace.
[1] https://marc.info/?l=linux-fsdevel&m=151690752225644 [2] https://git.kernel.org/pub/scm/linux/kernel/git/mhalcrow/linux.git/log/?h=fs-verity-dev Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index db5f61f821e4..2ede2e36f30f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -147,6 +147,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_FEATURE_LOST_FOUND 0x0200 +#define F2FS_FEATURE_VERITY 0x0400 /* reserved */ #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -662,6 +663,7 @@ enum { #define FADVISE_ENC_NAME_BIT 0x08 #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 +#define FADVISE_VERITY_BIT 0x40 /* reserved */ #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) From 86444d6006929a57706be791a09e1e77f9fd6fdd Mon Sep 17 00:00:00 2001 From: Junling Zheng Date: Thu, 29 Mar 2018 19:27:12 +0800 Subject: [PATCH 659/804] f2fs: fix a wrong condition in f2fs_skip_inode_update Fix commit 97dd26ad8347 (f2fs: fix wrong AUTO_RECOVER condition). We should use ~PAGE_MASK to determine whether i_size is aligned to the f2fs's block size or not. 
Signed-off-by: Junling Zheng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2ede2e36f30f..1ca32899af2b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2535,7 +2535,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || file_keep_isize(inode) || - i_size_read(inode) & PAGE_MASK) + i_size_read(inode) & ~PAGE_MASK) return false; down_read(&F2FS_I(inode)->i_sem); From a8d07f1f9c627e53ae608cb3273ade02ae56e343 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 30 Mar 2018 17:58:13 -0700 Subject: [PATCH 660/804] f2fs: truncate preallocated blocks in error case If write is failed, we must deallocate the blocks that we couldn't write. Cc: stable@vger.kernel.org Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 161ddca86387..39c3acb454a3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2711,6 +2711,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = generic_write_checks(iocb, from); if (ret > 0) { + bool preallocated = false; + size_t target_size = 0; int err; if (iov_iter_fault_in_readable(from, iov_iter_count(from))) @@ -2727,6 +2729,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } } else { + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + err = f2fs_preallocate_blocks(iocb, from); if (err) { clear_inode_flag(inode, FI_NO_PREALLOC); @@ -2739,6 +2744,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + /* if we couldn't write data, we should deallocate blocks. 
*/ + if (preallocated && i_size_read(inode) < target_size) + f2fs_truncate(inode); + if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } From 6cb5aa02bfbd5081549988aa76cd3598eb1acdab Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Mon, 2 Apr 2018 20:22:20 +0800 Subject: [PATCH 661/804] f2fs: make assignment of t->dentry_bitmap more readable In make_dentry_ptr_block, it is confusing that "&" is used for t->dentry_bitmap but not for t->dentry, so delete the "&" to make the code more readable. Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1ca32899af2b..3bb4071633f2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -536,7 +536,7 @@ static inline void make_dentry_ptr_block(struct inode *inode, d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; - d->bitmap = &t->dentry_bitmap; + d->bitmap = t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } From 42bf67fc543bc5945b355ad38f40cd84c3886786 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Mar 2018 22:50:41 -0700 Subject: [PATCH 662/804] f2fs: remain written times to update inode during fsync This fixes xfstests/generic/392. The failure was caused by different times between 1) one marked in the last fsync(2) call and 2) the other given by roll-forward recovery after power-cut.
The reason was that we skipped updating inode block at 1), since its i_size was recoverable along with 4KB-aligned data writes, which was fixed by: "f2fs: fix a wrong condition in f2fs_skip_inode_update" Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++++++++++ fs/f2fs/inode.c | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3bb4071633f2..a8bdcf5fc50f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -728,6 +728,7 @@ struct f2fs_inode_info { kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec i_crtime; /* inode creation time */ + struct timespec i_disk_time[4]; /* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2538,6 +2539,16 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + return false; + if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + &F2FS_I(inode)->i_crtime)) + return false; + down_read(&F2FS_I(inode)->i_sem); ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); up_read(&F2FS_I(inode)->i_sem); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 562a56bc037c..51846fc54fbd 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -284,6 +284,10 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -444,6 +448,10 @@ void update_inode(struct inode 
*inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; + F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; } void update_inode_page(struct inode *inode) From 13890bed2032a3d92ea25df2ffe42b54d329f60d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Apr 2018 11:09:47 -0700 Subject: [PATCH 663/804] fscrypt: allow synchronous bio decryption Currently, fscrypt provides fscrypt_decrypt_bio_pages() which decrypts a bio's pages asynchronously, then unlocks them afterwards. But, this assumes that decryption is the last "postprocessing step" for the bio, so it's incompatible with additional postprocessing steps such as authenticity verification after decryption. Therefore, rename the existing fscrypt_decrypt_bio_pages() to fscrypt_enqueue_decrypt_bio(). Then, add fscrypt_decrypt_bio() which decrypts the pages in the bio synchronously without unlocking the pages, nor setting them Uptodate; and add fscrypt_enqueue_decrypt_work(), which enqueues work on the fscrypt_read_workqueue. The new functions will be used by filesystems that support both fscrypt and fs-verity. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/crypto/bio.c | 35 +++++++++++++++++++++------------ fs/crypto/crypto.c | 8 +++++++- fs/crypto/fscrypt_private.h | 1 - fs/f2fs/data.c | 2 +- include/linux/fscrypt_notsupp.h | 13 +++++++++--- include/linux/fscrypt_supp.h | 5 ++++- 6 files changed, 44 insertions(+), 20 deletions(-) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a91ed46fe503..c7cf565c434e 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -25,15 +25,8 @@ #include #include "fscrypt_private.h" -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. 
- */ -static void completion_pages(struct work_struct *work) +static void __fscrypt_decrypt_bio(struct bio *bio, bool done) { - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; struct bio_vec *bv; int i; @@ -45,22 +38,38 @@ static void completion_pages(struct work_struct *work) if (ret) { WARN_ON_ONCE(1); SetPageError(page); - } else { + } else if (done) { SetPageUptodate(page); } - unlock_page(page); + if (done) + unlock_page(page); } +} + +void fscrypt_decrypt_bio(struct bio *bio) +{ + __fscrypt_decrypt_bio(bio, false); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio); + +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + + __fscrypt_decrypt_bio(bio, true); fscrypt_release_ctx(ctx); bio_put(bio); } -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio) { INIT_WORK(&ctx->r.work, completion_pages); ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); + fscrypt_enqueue_decrypt_work(&ctx->r.work); } -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); +EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio); void fscrypt_pullback_bio_page(struct page **page, bool restore) { diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index ce654526c0fb..0758d32ad01b 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -45,12 +45,18 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -struct workqueue_struct *fscrypt_read_workqueue; +static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; struct kmem_cache *fscrypt_info_cachep; +void fscrypt_enqueue_decrypt_work(struct work_struct *work) +{ + queue_work(fscrypt_read_workqueue, work); +} 
+EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work); + /** * fscrypt_release_ctx() - Releases an encryption context * @ctx: The encryption context to release. diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 5c296d4af4a9..426aa1b27f17 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -107,7 +107,6 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode, /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); -extern struct workqueue_struct *fscrypt_read_workqueue; extern int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b677300c5bac..7acc982f632d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -65,7 +65,7 @@ static void f2fs_read_end_io(struct bio *bio) if (bio->bi_error) { fscrypt_release_ctx(bio->bi_private); } else { - fscrypt_decrypt_bio_pages(bio->bi_private, bio); + fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); return; } } diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 5777251400f9..44bd4fbd3ec5 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -24,6 +24,10 @@ static inline bool fscrypt_dummy_context_enabled(struct inode *inode) } /* crypto.c */ +static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) +{ +} + static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { @@ -160,10 +164,13 @@ static inline bool fscrypt_match_name(const struct fscrypt_name *fname, } /* bio.c */ -static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, - struct bio *bio) +static inline void fscrypt_decrypt_bio(struct bio *bio) +{ +} + +static inline void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, + struct bio *bio) { - return; } static inline void fscrypt_pullback_bio_page(struct page **page, 
bool restore) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index c88d2058902a..9d1857302b73 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -58,6 +58,7 @@ static inline bool fscrypt_dummy_context_enabled(struct inode *inode) } /* crypto.c */ +extern void fscrypt_enqueue_decrypt_work(struct work_struct *); extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, @@ -187,7 +188,9 @@ static inline bool fscrypt_match_name(const struct fscrypt_name *fname, } /* bio.c */ -extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_decrypt_bio(struct bio *); +extern void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, + struct bio *bio); extern void fscrypt_pullback_bio_page(struct page **, bool); extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, unsigned int); From c18b4f60c8dfa090117422018e5f052e2b5b5ba8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Apr 2018 11:09:48 -0700 Subject: [PATCH 664/804] f2fs: refactor read path to allow multiple postprocessing steps Currently f2fs's ->readpage() and ->readpages() assume that either the data undergoes no postprocessing, or decryption only. But with fs-verity, there will be an additional authenticity verification step, and it may be needed either by itself, or combined with decryption. To support this, store a 'struct bio_post_read_ctx' in ->bi_private which contains a work struct, a bitmask of postprocessing steps that are enabled, and an indicator of the current step. The bio completion routine, if there was no I/O error, enqueues the first postprocessing step. When that completes, it continues to the next step. Pages that fail any postprocessing step have PageError set. 
Once all steps have completed, pages without PageError set are set Uptodate, and all pages are unlocked. Also replace f2fs_encrypted_file() with a new function f2fs_post_read_required() in places like direct I/O and garbage collection that really should be testing whether the file needs special I/O processing, not whether it is encrypted specifically. This may also be useful for other future f2fs features such as compression. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 168 +++++++++++++++++++++++++++++++++++------------ fs/f2fs/f2fs.h | 12 +++- fs/f2fs/file.c | 4 +- fs/f2fs/gc.c | 6 +- fs/f2fs/inline.c | 2 +- fs/f2fs/super.c | 6 ++ 6 files changed, 148 insertions(+), 50 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7acc982f632d..d3d2e4775003 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,11 @@ #include "trace.h" #include +#define NUM_PREALLOC_POST_READ_CTXS 128 + +static struct kmem_cache *bio_post_read_ctx_cache; +static mempool_t *bio_post_read_ctx_pool; + static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -49,11 +54,77 @@ static bool __is_cp_guaranteed(struct page *page) return false; } -static void f2fs_read_end_io(struct bio *bio) +/* postprocessing steps for read bios */ +enum bio_post_read_step { + STEP_INITIAL = 0, + STEP_DECRYPT, +}; + +struct bio_post_read_ctx { + struct bio *bio; + struct work_struct work; + unsigned int cur_step; + unsigned int enabled_steps; +}; + +static void __read_end_io(struct bio *bio) { - struct bio_vec *bvec; + struct page *page; + struct bio_vec *bv; int i; + bio_for_each_segment_all(bv, bio, i) { + page = bv->bv_page; + + /* PG_error was set if any post_read step failed */ + if (bio->bi_error || PageError(page)) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + 
bio_put(bio); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx); + +static void decrypt_work(struct work_struct *work) +{ + struct bio_post_read_ctx *ctx = + container_of(work, struct bio_post_read_ctx, work); + + fscrypt_decrypt_bio(ctx->bio); + + bio_post_read_processing(ctx); +} + +static void bio_post_read_processing(struct bio_post_read_ctx *ctx) +{ + switch (++ctx->cur_step) { + case STEP_DECRYPT: + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { + INIT_WORK(&ctx->work, decrypt_work); + fscrypt_enqueue_decrypt_work(&ctx->work); + return; + } + ctx->cur_step++; + /* fall-through */ + default: + __read_end_io(ctx->bio); + } +} + +static bool f2fs_bio_post_read_required(struct bio *bio) +{ + return bio->bi_private && !bio->bi_error; +} + +static void f2fs_read_end_io(struct bio *bio) +{ #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); @@ -61,28 +132,15 @@ static void f2fs_read_end_io(struct bio *bio) } #endif - if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { - fscrypt_release_ctx(bio->bi_private); - } else { - fscrypt_enqueue_decrypt_bio(bio->bi_private, bio); - return; - } + if (f2fs_bio_post_read_required(bio)) { + struct bio_post_read_ctx *ctx = bio->bi_private; + + ctx->cur_step = STEP_INITIAL; + bio_post_read_processing(ctx); + return; } - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (!bio->bi_error) { - if (!PageUptodate(page)) - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } - bio_put(bio); + __read_end_io(bio); } static void f2fs_write_end_io(struct bio *bio) @@ -479,29 +537,33 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; struct bio *bio; + struct bio_post_read_ctx *ctx; + unsigned int post_read_steps 
= 0; - if (f2fs_encrypted_file(inode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); + if (!bio) + return ERR_PTR(-ENOMEM); + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + if (f2fs_encrypted_file(inode)) + post_read_steps |= 1 << STEP_DECRYPT; + if (post_read_steps) { + ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + if (!ctx) { + bio_put(bio); + return ERR_PTR(-ENOMEM); + } + ctx->bio = bio; + ctx->enabled_steps = post_read_steps; + bio->bi_private = ctx; /* wait the page to be moved by cleaning */ f2fs_wait_on_block_writeback(sbi, blkaddr); } - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - return bio; } @@ -1522,7 +1584,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) if (!f2fs_encrypted_file(inode)) return 0; - /* wait for GCed encrypted page writeback */ + /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: @@ -2224,8 +2286,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_file(inode)) + /* wait for GCed page writeback via META_MAPPING */ + if (f2fs_post_read_required(inode)) f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) @@ -2556,3 +2618,27 @@ const struct address_space_operations f2fs_dblock_aops = { .migratepage = f2fs_migrate_page, #endif }; + +int __init f2fs_init_post_read_processing(void) +{ + bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0); + if (!bio_post_read_ctx_cache) + goto fail; + 
bio_post_read_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, + bio_post_read_ctx_cache); + if (!bio_post_read_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_post_read_ctx_cache); +fail: + return -ENOMEM; +} + +void __exit f2fs_destroy_post_read_processing(void) +{ + mempool_destroy(bio_post_read_ctx_pool); + kmem_cache_destroy(bio_post_read_ctx_cache); +} diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a8bdcf5fc50f..5ca193f25874 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2942,6 +2942,8 @@ void destroy_checkpoint_caches(void); /* * data.c */ +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, nid_t ino, pgoff_t idx, @@ -3302,9 +3304,13 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) #endif } -static inline bool f2fs_bio_encrypted(struct bio *bio) +/* + * Returns true if the reads of the inode's data need to undergo some + * postprocessing step, like decryption or authenticity verification. 
+ */ +static inline bool f2fs_post_read_required(struct inode *inode) { - return bio->bi_private != NULL; + return f2fs_encrypted_file(inode); } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3372,7 +3378,7 @@ static inline bool f2fs_may_encrypt(struct inode *inode) static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) { - return (f2fs_encrypted_file(inode) || + return (f2fs_post_read_required(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 39c3acb454a3..7587758a285f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -113,8 +113,8 @@ mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_file(inode)) + /* wait for GCed page writeback via META_MAPPING */ + if (f2fs_post_read_required(inode)) f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 54f51a990794..c009b50d69f5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -850,8 +850,8 @@ next_step: if (IS_ERR(inode) || is_bad_inode(inode)) continue; - /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_file(inode)) { + /* if inode uses special I/O path, let's go phase 3 */ + if (f2fs_post_read_required(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -899,7 +899,7 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_file(inode)) + if (f2fs_post_read_required(inode)) move_data_block(inode, start_bidx, segno, off); else move_data_page(inode, start_bidx, gc_type, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 12f6c6471c56..67523fabb822 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -25,7 +25,7 @@ bool f2fs_may_inline_data(struct inode *inode) if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_file(inode)) + if (f2fs_post_read_required(inode)) return false; 
return true; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a622eb4f59f2..55b2bad55671 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3100,8 +3100,13 @@ static int __init init_f2fs_fs(void) err = f2fs_create_root_stats(); if (err) goto free_filesystem; + err = f2fs_init_post_read_processing(); + if (err) + goto free_root_stats; return 0; +free_root_stats: + f2fs_destroy_root_stats(); free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: @@ -3124,6 +3129,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); From dafecc032ea1cfbc1399ec5371d10961aaf59a59 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 18 Apr 2018 15:48:42 -0700 Subject: [PATCH 665/804] f2fs: call unlock_new_inode() before d_instantiate() xfstest generic/429 sometimes hangs on f2fs, caused by a thread being unable to take a directory's i_rwsem for write in vfs_rmdir(). In the test, one thread repeatedly creates and removes a directory, and other threads repeatedly look up a file in the directory. The bug is that f2fs_mkdir() calls d_instantiate() before unlock_new_inode(), resulting in the directory inode being exposed to lookups before it has been fully initialized. And with CONFIG_DEBUG_LOCK_ALLOC, unlock_new_inode() reinitializes ->i_rwsem, corrupting its state when it is already held. Fix it by calling unlock_new_inode() before d_instantiate(). This matches what other filesystems do. 
Fixes: 57397d86c62d ("f2fs: add inode operations for special inodes") Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5ec20f077629..fecae8685d2a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -294,8 +294,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -594,8 +594,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, err = page_symlink(inode, disk_link.name, disk_link.len); err_out: - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); /* * Let's flush symlink data in order to avoid broken symlink as much as @@ -658,8 +658,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -710,8 +710,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, alloc_nid_done(sbi, inode->i_ino); - d_instantiate(dentry, inode); unlock_new_inode(inode); + d_instantiate(dentry, inode); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); From 070da80085a4503d7e7c662f15cd4793d622a626 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Apr 2018 23:09:04 -0700 Subject: [PATCH 666/804] f2fs: clear PageError on writepage This patch clears PageError in some pages tagged by read path, but when we write the pages with valid contents, writepage should clear the bit likewise ext4. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d3d2e4775003..b8c142de7bb4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1734,6 +1734,7 @@ got_it: goto out_writepage; set_page_writeback(page); + ClearPageError(page); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); @@ -1756,6 +1757,7 @@ got_it: goto out_writepage; set_page_writeback(page); + ClearPageError(page); /* LFS mode write path */ write_data_page(&dn, fio); From 3e7a141175756d3df614ee9ab7480c2e04642b1d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 20 Apr 2018 19:29:52 -0700 Subject: [PATCH 667/804] Revert "f2fs: introduce f2fs_set_page_dirty_nobuffer" This patch reverts copied f2fs_set_page_dirty_nobuffer to use generic function for stability. This reverts commit fe76b796fc5194cc3d57265002e3a748566d073f. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +---------------------------------- fs/f2fs/f2fs.h | 1 - fs/f2fs/node.c | 2 +- 4 files changed, 3 insertions(+), 37 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 04c608646fd5..760d1ad22722 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -386,7 +386,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b8c142de7bb4..b48c578c0bf6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,8 +19,6 @@ #include #include #include -#include -#include #include #include "f2fs.h" @@ -2473,37 +2471,6 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 1; } -/* - * This was copied from __set_page_dirty_buffers which gives 
higher performance - * in very high speed storages. (e.g., pmem) - */ -void f2fs_set_page_dirty_nobuffers(struct page *page) -{ - struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; - unsigned long flags; - - if (unlikely(!mapping)) - return; - - spin_lock(&mapping->private_lock); - memcg = mem_cgroup_begin_page_stat(page); - SetPageDirty(page); - spin_unlock(&mapping->private_lock); - - spin_lock_irqsave(&mapping->tree_lock, flags); - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - - mem_cgroup_end_page_stat(memcg); - - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return; -} - static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -2527,7 +2494,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5ca193f25874..486107b8b38c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2975,7 +2975,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); -void f2fs_set_page_dirty_nobuffers(struct page *page); int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 157d768c7b31..3871e7d3f69e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1775,7 +1775,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); if (!PageDirty(page)) { - f2fs_set_page_dirty_nobuffers(page); + 
__set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); From f819874f58cf77184907d41e7358d970f32bc061 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 20 Apr 2018 23:44:59 -0700 Subject: [PATCH 668/804] f2fs: check cap_resource only for data blocks This patch changes the rule to check cap_resource for data blocks, not inode or node blocks in order to avoid selinux denial. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 486107b8b38c..978c58d329f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1678,7 +1678,7 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) } static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool cap) { if (!inode) return true; @@ -1691,7 +1691,7 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, if (!gid_eq(F2FS_OPTION(sbi).s_resgid, GLOBAL_ROOT_GID) && in_group_p(F2FS_OPTION(sbi).s_resgid)) return true; - if (capable(CAP_SYS_RESOURCE)) + if (cap && capable(CAP_SYS_RESOURCE)) return true; return false; } @@ -1726,7 +1726,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode)) + if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { @@ -1933,7 +1933,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, valid_block_count = sbi->total_valid_block_count + sbi->current_reserved_blocks + 1; - if (!__allow_reserved_blocks(sbi, inode)) + if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; if 
(unlikely(valid_block_count > sbi->user_block_count)) { From a44b418c31458b213ab59659776f5597e7e78b32 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Apr 2018 23:09:04 -0700 Subject: [PATCH 669/804] f2fs: clear PageError on writepage - part 2 This patch clears PageError in some pages tagged by read path, but when we write the pages with valid contents, writepage should clear the bit likewise ext4. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + fs/f2fs/inline.c | 1 + fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 1 + 4 files changed, 4 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index c009b50d69f5..d28d31cbd7d2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -693,6 +693,7 @@ static void move_data_block(struct inode *inode, block_t bidx, dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); + ClearPageError(page); /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 67523fabb822..2ff0305391cd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -139,6 +139,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); + ClearPageError(page); fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3871e7d3f69e..16aee2a7b8a9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1398,6 +1398,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.op_flags |= WRITE_FLUSH_FUA; set_page_writeback(page); + ClearPageError(page); fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d7bac60ad719..01bc94df9f00 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2838,6 +2838,7 
@@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, fio.op_flags &= ~REQ_META; set_page_writeback(page); + ClearPageError(page); f2fs_submit_page_write(&fio); f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); From 520a9486182437847212c8e226d042b1e14b7cc2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 May 2018 23:26:02 -0700 Subject: [PATCH 670/804] f2fs: avoid fsync() failure caused by EAGAIN in writepage() pageout() in MM translates EAGAIN, so calls handle_write_error() -> mapping_set_error() -> set_bit(AS_EIO, ...). file_write_and_wait_range() will see EIO error, which is critical to return value of fsync() followed by atomic_write failure to user. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b48c578c0bf6..b675d5dd5c91 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1901,7 +1901,13 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); - if (!err) + /* + * pageout() in MM traslates EAGAIN, so calls handle_write_error() + * -> mapping_set_error() -> set_bit(AS_EIO, ...). + * file_write_and_wait_range() will see EIO error, which is critical + * to return value of fsync() followed by atomic_write failure to user. + */ + if (!err || wbc->for_reclaim) return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; From bb53d06b5f21161295e6dea0eda941351cc9d3a1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 May 2018 13:57:26 -0700 Subject: [PATCH 671/804] f2fs: let fstrim issue discard commands in lower priority The fstrim gathers huge number of large discard commands, and tries to issue without IO awareness, which results in long user-perceived IO latencies on READ, WRITE, and FLUSH in UFS. We've observed some commands take several seconds due to long discard latency. This patch limits the maximum size to 2MB per candidate, and checks IO congestion when issuing them to disk.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 +- fs/f2fs/segment.c | 139 +++++++++++++++++++++++++--------------------- 2 files changed, 78 insertions(+), 65 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 978c58d329f8..0bb23ad94b39 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,7 @@ enum { (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ +#define DEF_MAX_DISCARD_LEN 512 /* Max. 2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -758,7 +759,8 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front) { - return back->lstart + back->len == front->lstart; + return (back->lstart + back->len == front->lstart) && + (back->len + front->len < DEF_MAX_DISCARD_LEN); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 01bc94df9f00..0889f4b8dbf3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1210,68 +1210,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, - struct discard_policy *dpolicy, - unsigned int start, unsigned int end) -{ - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct discard_cmd *prev_dc = NULL, *next_dc = NULL; - struct rb_node **insert_p = NULL, *insert_parent = NULL; - struct discard_cmd *dc; - struct blk_plug plug; - int issued; - -next: - issued = 0; - - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); - - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, - NULL, start, - 
(struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); - if (!dc) - dc = next_dc; - - blk_start_plug(&plug); - - while (dc && dc->lstart <= end) { - struct rb_node *node; - - if (dc->len < dpolicy->granularity) - goto skip; - - if (dc->state != D_PREP) { - list_move_tail(&dc->list, &dcc->fstrim_list); - goto skip; - } - - __submit_discard_cmd(sbi, dpolicy, dc); - - if (++issued >= dpolicy->max_requests) { - start = dc->lstart + dc->len; - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); - - schedule(); - - goto next; - } -skip: - node = rb_next(&dc->rb_node); - dc = rb_entry_safe(node, struct discard_cmd, rb_node); - - if (fatal_signal_pending(current)) - break; - } - - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); -} - static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { @@ -1412,7 +1350,18 @@ next: static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { - __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + struct discard_policy dp; + + if (dpolicy) { + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); + return; + } + + /* wait all */ + init_discard_policy(&dp, DPOLICY_FSTRIM, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + init_discard_policy(&dp, DPOLICY_UMOUNT, 1); + __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1460,8 +1409,9 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); - __wait_all_discard_cmd(sbi, &dpolicy); + /* just to make sure there is no pending discard commands */ + __wait_all_discard_cmd(sbi, NULL); return dropped; } @@ -2453,6 +2403,67 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) return has_candidate; } +static void 
__issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + __wait_all_discard_cmd(sbi, NULL); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); From 70676ef73646598128b0521187d78e2fd492bed0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 May 2018 18:02:58 -0700 Subject: [PATCH 672/804] f2fs: add fsync_mode=nobarrier for non-atomic files For non-atomic files, this patch adds an option to give nobarrier which doesn't issue flush commands to the device. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 16 +++++++++------- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 4 ++++ 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 1f52baea2f69..ecccb51c7279 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -180,13 +180,15 @@ whint_mode=%s Control which write hints are passed down to block passes down hints with its policy. alloc_mode=%s Adjust block allocation policy, which supports "reuse" and "default". -fsync_mode=%s Control the policy of fsync. Currently supports "posix" - and "strict". In "posix" mode, which is default, fsync - will follow POSIX semantics and does a light operation - to improve the filesystem performance. In "strict" mode, - fsync will be heavy and behaves in line with xfs, ext4 - and btrfs, where xfstest generic/342 will pass, but the - performance will regress. +fsync_mode=%s Control the policy of fsync. Currently supports "posix", + "strict", and "nobarrier". In "posix" mode, which is + default, fsync will follow POSIX semantics and does a + light operation to improve the filesystem performance. + In "strict" mode, fsync will be heavy and behaves in line + with xfs, ext4 and btrfs, where xfstest generic/342 will + pass, but the performance will regress. "nobarrier" is + based on "posix", but doesn't issue flush command for + non-atomic files likewise "nobarrier" mount option. test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt context. The fake fscrypt context is used by xfstests. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0bb23ad94b39..2b722d50f096 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1146,6 +1146,7 @@ enum { enum fsync_mode { FSYNC_MODE_POSIX, /* fsync follows posix semantics */ FSYNC_MODE_STRICT, /* fsync behaves in line with ext4 */ + FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; #ifdef CONFIG_F2FS_FS_ENCRYPTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7587758a285f..40d03d58b390 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -309,7 +309,7 @@ sync_nodes: remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic) + if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { remove_ino_entry(sbi, ino, UPDATE_INO); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 55b2bad55671..cb57ad3ca32d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -740,6 +740,10 @@ static int parse_options(struct super_block *sb, char *options) } else if (strlen(name) == 6 && !strncmp(name, "strict", 6)) { F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT; + } else if (strlen(name) == 9 && + !strncmp(name, "nobarrier", 9)) { + F2FS_OPTION(sbi).fsync_mode = + FSYNC_MODE_NOBARRIER; } else { kfree(name); return -EINVAL; From 31e2713935ea102ddb29dc5bf496d335a213f7f9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 May 2018 09:58:42 -0700 Subject: [PATCH 673/804] f2fs: issue discard commands proactively in high fs utilization In the high utilization like over 80%, we don't expect huge # of large discard commands, but do many small pending discards which affects FTL GCs a lot. Let's issue them in that case. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 71 ++++++++++++++++++++++++++--------------------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2b722d50f096..249635a5f472 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -253,6 +253,7 @@ enum { #define DEF_MAX_DISCARD_LEN 512 /* Max. 2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ +#define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -2861,8 +2862,6 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void init_discard_policy(struct discard_policy *dpolicy, int discard_type, - unsigned int granularity); void drop_discard_cmd(struct f2fs_sb_info *sbi); void stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0889f4b8dbf3..8df1a168256b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,6 +996,38 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi, #endif } +static void __init_discard_policy(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->granularity = granularity; + + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = true; + if 
(utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + dpolicy->granularity = 1; + dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + } + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->io_aware = false; + } +} + + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, @@ -1358,9 +1390,9 @@ static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, } /* wait all */ - init_discard_policy(&dp, DPOLICY_FSTRIM, 1); + __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); - init_discard_policy(&dp, DPOLICY_UMOUNT, 1); + __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); } @@ -1406,7 +1438,8 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; - init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, + dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -1427,7 +1460,7 @@ static int issue_discard_thread(void *data) set_freezable(); do { - init_discard_policy(&dpolicy, DPOLICY_BG, + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); wait_event_interruptible_timeout(*q, @@ -1445,7 +1478,7 @@ static int issue_discard_thread(void *data) dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - init_discard_policy(&dpolicy, DPOLICY_FORCE, 1); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); @@ -1738,32 +1771,6 @@ skip: wake_up_discard_thread(sbi, false); } -void 
init_discard_policy(struct discard_policy *dpolicy, - int discard_type, unsigned int granularity) -{ - /* common policy */ - dpolicy->type = discard_type; - dpolicy->sync = true; - dpolicy->granularity = granularity; - - dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; - dpolicy->io_aware_gran = MAX_PLIST_NUM; - - if (discard_type == DPOLICY_BG) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = true; - } else if (discard_type == DPOLICY_FORCE) { - dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; - dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_FSTRIM) { - dpolicy->io_aware = false; - } else if (discard_type == DPOLICY_UMOUNT) { - dpolicy->io_aware = false; - } -} - static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -2522,7 +2529,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); - init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); From 4738f527db84ecb5d40691b8e5bf3e9bfced2243 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Apr 2018 10:25:23 +0800 Subject: [PATCH 674/804] f2fs: don't split checkpoint in fstrim Now, we issue discard asynchronously in separated thread instead of in checkpoint, after that, we won't encounter long latency in checkpoint due to huge number of synchronous discard command handling, so, we don't need to split checkpoint to do trim in batch, merge it and obsolete related sysfs entry. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 1 + fs/f2fs/f2fs.h | 5 ---- fs/f2fs/segment.c | 39 ++++++++----------------- fs/f2fs/sysfs.c | 3 ++ 4 files changed, 16 insertions(+), 32 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index b8d0a30f1644..f82da9bbb1fd 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -101,6 +101,7 @@ Date: February 2015 Contact: "Jaegeuk Kim" Description: Controls the trimming rate in batch mode. + What: /sys/fs/f2fs//cp_interval Date: October 2015 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 249635a5f472..51e4a9499f04 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -243,11 +243,6 @@ enum { #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2048 -#define BATCHED_TRIM_SEGMENTS(sbi) \ - (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) -#define BATCHED_TRIM_BLOCKS(sbi) \ - (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MAX_DISCARD_LEN 512 /* Max. 
2MB per discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8df1a168256b..30f07dd5da3f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2475,7 +2475,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno, cur_segno; + unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; @@ -2501,40 +2501,27 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; - /* do checkpoint to issue discard commands safely */ - for (cur_segno = start_segno; cur_segno <= end_segno; - cur_segno = cpc.trim_end + 1) { - cpc.trim_start = cur_segno; + if (sbi->discard_blks == 0) + goto out; - if (sbi->discard_blks == 0) - break; - else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) - cpc.trim_end = end_segno; - else - cpc.trim_end = min_t(unsigned int, - rounddown(cur_segno + - BATCHED_TRIM_SEGMENTS(sbi), - sbi->segs_per_sec) - 1, end_segno); - - mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); - if (err) - break; - - schedule(); - } + mutex_lock(&sbi->gc_mutex); + err = write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + if (err) + goto out; start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); -out: range->len = F2FS_BLK_TO_BYTES(trimmed); +out: return err; } @@ -3922,8 +3909,6 @@ int 
build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); - sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; - INIT_LIST_HEAD(&sm_info->sit_entry_set); init_rwsem(&sm_info->curseg_lock); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f33a56d6e6dd..2c53de9251be 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -245,6 +245,9 @@ out: return count; } + if (!strcmp(a->attr.name, "trim_sections")) + return -EINVAL; + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) From 85d2070f60c66b469e21a0c8e67c25c7cd5b4c45 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 10 Apr 2018 15:43:09 +0800 Subject: [PATCH 675/804] f2fs: turn down IO priority of discard from background In order to avoid interfering normal r/w IO, let's turn down IO priority of discard issued from background. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30f07dd5da3f..478a4504ba9a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1012,6 +1012,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = true; + dpolicy->sync = false; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; From 73450231fffffe6a2863d493cd053596f2a2de57 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 31 May 2018 10:20:48 -0700 Subject: [PATCH 676/804] f2fs: run fstrim asynchronously if runtime discard is on We don't need to wait for whole bunch of discard candidates in fstrim, since runtime discard will issue them in idle time. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 478a4504ba9a..a02d5c1a7ed2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2519,9 +2519,18 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + + /* + * We filed discard candidates, but actually we don't need to wait for + * all of them, since they'll be issued in idle time along with runtime + * discard option. User configuration looks like using runtime discard + * or periodic fstrim instead of it. + */ + if (!test_opt(sbi, DISCARD)) { + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - range->len = F2FS_BLK_TO_BYTES(trimmed); + range->len = F2FS_BLK_TO_BYTES(trimmed); + } out: return err; } From 39b14449060651cde9d2d5e0b6e48f0674a087c7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 20 Apr 2018 16:30:02 -0700 Subject: [PATCH 677/804] fscrypt: use unbound workqueue for decryption Improve fscrypt read performance by switching the decryption workqueue from bound to unbound. With the bound workqueue, when multiple bios completed on the same CPU, they were decrypted on that same CPU. But with the unbound queue, they are now decrypted in parallel on any CPU. Although fscrypt read performance can be tough to measure due to the many sources of variation, this change is most beneficial when decryption is slow, e.g. on CPUs without AES instructions. For example, I timed tarring up encrypted directories on f2fs. On x86 with AES-NI instructions disabled, the unbound workqueue improved performance by about 25-35%, using 1 to NUM_CPUs jobs with 4 or 8 CPUs available. But with AES-NI enabled, performance was unchanged to within ~2%. 
I also did the same test on a quad-core ARM CPU using xts-speck128-neon encryption. There performance was usually about 10% better with the unbound workqueue, bringing it closer to the unencrypted speed. The unbound workqueue may be worse in some cases due to worse locality, but I think it's still the better default. dm-crypt uses an unbound workqueue by default too, so this change makes fscrypt match. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 0758d32ad01b..2f646b1248bc 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -433,8 +433,17 @@ fail: */ static int __init fscrypt_init(void) { + /* + * Use an unbound workqueue to allow bios to be decrypted in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since decryption is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize decryption work, + * which blocks reads from completing, over regular application tasks. + */ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", - WQ_HIGHPRI, 0); + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); if (!fscrypt_read_workqueue) goto fail; From fb10231825e94a1eea7d5e0b9d23824b6add6113 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:36 -0700 Subject: [PATCH 678/804] fscrypt: clean up after fscrypt_prepare_lookup() conversions Now that all filesystems have been converted to use fscrypt_prepare_lookup(), we can remove the fscrypt_set_d_op() and fscrypt_set_encrypted_dentry() functions as well as un-export fscrypt_d_ops. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 1 - fs/crypto/fscrypt_private.h | 1 + include/linux/fscrypt_notsupp.h | 10 ---------- include/linux/fscrypt_supp.h | 14 -------------- 4 files changed, 1 insertion(+), 25 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2f646b1248bc..a00efa266eb5 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -359,7 +359,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, }; -EXPORT_SYMBOL(fscrypt_d_ops); void fscrypt_restore_control_page(struct page *page) { diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 426aa1b27f17..978d0e061aed 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -115,6 +115,7 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, gfp_t gfp_flags); extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +extern const struct dentry_operations fscrypt_d_ops; /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 44bd4fbd3ec5..e2729c6d9829 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -67,16 +67,6 @@ static inline void fscrypt_restore_control_page(struct page *page) return; } -static inline void fscrypt_set_d_op(struct dentry *dentry) -{ - return; -} - -static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) -{ - return; -} - /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 9d1857302b73..4f0a5c5ab441 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -74,20 +74,6 @@ static inline struct page 
*fscrypt_control_page(struct page *page) extern void fscrypt_restore_control_page(struct page *); -extern const struct dentry_operations fscrypt_d_ops; - -static inline void fscrypt_set_d_op(struct dentry *dentry) -{ - d_set_d_op(dentry, &fscrypt_d_ops); -} - -static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; - spin_unlock(&dentry->d_lock); -} - /* policy.c */ extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); extern int fscrypt_ioctl_get_policy(struct file *, void __user *); From f68d3b84aef18bb91329907995d2c7e083ae8c75 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:37 -0700 Subject: [PATCH 679/804] fscrypt: remove unnecessary NULL check when allocating skcipher crypto_alloc_skcipher() returns an ERR_PTR() on failure, not NULL. Remove the unnecessary check for NULL. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 7c00331da5df..7750179bba4b 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -317,8 +317,8 @@ int fscrypt_get_encryption_info(struct inode *inode) goto out; } ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + if (IS_ERR(ctfm)) { + res = PTR_ERR(ctfm); pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", __func__, res, inode->i_ino); goto out; From d56de4e926ade7e0afd929792aced636c1f178ba Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:38 -0700 Subject: [PATCH 680/804] fscrypt: remove error messages for skcipher_request_alloc() failure skcipher_request_alloc() can only fail due to lack of memory, and in that case the memory allocator will have already printed a detailed error message. Thus, remove the redundant error messages from fscrypt. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 6 +----- fs/crypto/fname.c | 10 ++-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index a00efa266eb5..021f348900b1 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -162,12 +162,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, } req = skcipher_request_alloc(tfm, gfp_flags); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index b18fa323d1d9..8af9e35b4f29 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -58,11 +58,8 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: skcipher_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); @@ -107,11 +104,8 @@ static int fname_decrypt(struct inode *inode, /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); From 89b7fb82982fbe9a0951fde557a23cdf99b8cdbd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:39 -0700 Subject: [PATCH 681/804] fscrypt: remove stale comment from fscrypt_d_revalidate() Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 
021f348900b1..b12c53e6efb1 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -328,7 +328,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); From 52c51f7b7bde658d8e5abb50729dae361d0e8e35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:40 -0700 Subject: [PATCH 682/804] fscrypt: don't clear flags on crypto transform fscrypt is clearing the flags on the crypto_skcipher it allocates for each inode. But, this is unnecessary and may cause problems in the future because it will even clear flags that are meant to be internal to the crypto API, e.g. CRYPTO_TFM_NEED_KEY. Remove the unnecessary flag clearing. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 7750179bba4b..875ee0108468 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -324,7 +324,6 @@ int fscrypt_get_encryption_info(struct inode *inode) goto out; } crypt_info->ci_ctfm = ctfm; - crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); /* * if the provided key is longer than keysize, we use the first From 3f7af9d27fd6cabff05d149ca1178a5e27852c19 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:41 -0700 Subject: [PATCH 683/804] fscrypt: don't special-case EOPNOTSUPP from fscrypt_get_encryption_info() In fscrypt_setup_filename(), remove the unnecessary check for fscrypt_get_encryption_info() returning EOPNOTSUPP. There's no reason to handle this error differently from any other. I think there may have been some confusion because the "notsupp" version of fscrypt_get_encryption_info() returns EOPNOTSUPP -- but that's not applicable from inside fs/crypto/. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 8af9e35b4f29..19715de54d37 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -334,7 +334,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } ret = fscrypt_get_encryption_info(dir); - if (ret && ret != -EOPNOTSUPP) + if (ret) return ret; if (dir->i_crypt_info) { From 0077eff1d2e3816a40b71997ab677bb6ca671115 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:42 -0700 Subject: [PATCH 684/804] fscrypt: drop max_namelen check from fname_decrypt() fname_decrypt() returns an error if the input filename is longer than the inode's ->max_namelen() as given by the filesystem. But, this doesn't actually make sense because the filesystem provided the input filename in the first place, where it was subject to the filesystem's limits. And fname_decrypt() has no internal limit itself. Thus, remove this unnecessary check. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 19715de54d37..d21a5329d6ca 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -92,14 +92,11 @@ static int fname_decrypt(struct inode *inode, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - unsigned lim; - lim = inode->i_sb->s_cop->max_namelen(inode); - if (iname->len <= 0 || iname->len > lim) + if (iname->len <= 0) return -EIO; /* Allocate request */ From f572a22ef9a515eb0f81d6e9bac0fcc4988399f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:43 -0700 Subject: [PATCH 685/804] fscrypt: drop empty name check from fname_decrypt() fname_decrypt() is validating that the encrypted filename is nonempty. However, earlier a stronger precondition was already enforced: the encrypted filename must be at least 16 (FS_CRYPTO_BLOCK_SIZE) bytes. Drop the redundant check for an empty filename. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d21a5329d6ca..6c4c84ec18ff 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -96,9 +96,6 @@ static int fname_decrypt(struct inode *inode, int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - if (iname->len <= 0) - return -EIO; - /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) From 56446c91422e938c60f19fcf36115cbaae737b0d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:44 -0700 Subject: [PATCH 686/804] fscrypt: make fscrypt_operations.max_namelen an integer Now ->max_namelen() is only called to limit the filename length when adding NUL padding, and only for real filenames -- not symlink targets. It also didn't give the correct length for symlink targets anyway since it forgot to subtract 'sizeof(struct fscrypt_symlink_data)'. Thus, change ->max_namelen from a function to a simple 'unsigned int' that gives the filesystem's maximum filename length. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 2 +- fs/f2fs/super.c | 8 +------- include/linux/fscrypt_supp.h | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6c4c84ec18ff..b1b69ec4b4ff 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -333,7 +333,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { if (!fscrypt_fname_encrypted_size(dir, iname->len, - dir->i_sb->s_cop->max_namelen(dir), + dir->i_sb->s_cop->max_namelen, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cb57ad3ca32d..777ed4eafa6c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1938,19 +1938,13 @@ static bool f2fs_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); } -static unsigned f2fs_max_namelen(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? 
- inode->i_sb->s_blocksize : F2FS_NAME_LEN; -} - static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, - .max_namelen = f2fs_max_namelen, + .max_namelen = F2FS_NAME_LEN, }; #endif diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index 4f0a5c5ab441..46b62d82b6d6 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -28,7 +28,7 @@ struct fscrypt_operations { int (*set_context)(struct inode *, const void *, size_t, void *); bool (*dummy_context)(struct inode *); bool (*empty_dir)(struct inode *); - unsigned (*max_namelen)(struct inode *); + unsigned int max_namelen; }; struct fscrypt_ctx { From 7149dd4d39b54d3a59ecea7b2a95c842aa39a283 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:45 -0700 Subject: [PATCH 687/804] fscrypt: remove unnecessary check for non-logon key type We're passing 'key_type_logon' to request_key(), so the found key is guaranteed to be of type "logon". Thus, there is no reason to check later that the key is really a "logon" key. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 875ee0108468..90b326941c67 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -88,12 +88,6 @@ static int validate_user_key(struct fscrypt_info *crypt_info, return PTR_ERR(keyring_key); down_read(&keyring_key->sem); - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } ukp = user_key_payload(keyring_key); if (!ukp) { /* key was revoked before we acquired its semaphore */ From ff8e7c745e2bb71c549a0813dc8fbd8a1daf970f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:46 -0700 Subject: [PATCH 688/804] fscrypt: remove internal key size constants With one exception, the internal key size constants such as FS_AES_256_XTS_KEY_SIZE are only used for the 'available_modes' array, where they really only serve to obfuscate what the values are. Also some of the constants are unused, and the key sizes tend to be in the names of the algorithms anyway. In the past these values were also misused, e.g. we used to have FS_AES_256_XTS_KEY_SIZE in places that technically should have been FS_MAX_KEY_SIZE. The exception is that FS_AES_128_ECB_KEY_SIZE is used for key derivation. But it's more appropriate to use FS_KEY_DERIVATION_NONCE_SIZE for that instead. Thus, just put the sizes directly in the 'available_modes' array. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fscrypt_private.h | 10 +--------- fs/crypto/keyinfo.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 978d0e061aed..cc64e7e42fa1 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -17,15 +17,7 @@ /* Encryption parameters */ #define FS_IV_SIZE 16 -#define FS_AES_128_ECB_KEY_SIZE 16 -#define FS_AES_128_CBC_KEY_SIZE 16 -#define FS_AES_128_CTS_KEY_SIZE 16 -#define FS_AES_256_GCM_KEY_SIZE 32 -#define FS_AES_256_CBC_KEY_SIZE 32 -#define FS_AES_256_CTS_KEY_SIZE 32 -#define FS_AES_256_XTS_KEY_SIZE 64 - -#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_KEY_DERIVATION_NONCE_SIZE 16 /** * Encryption context for inode diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 90b326941c67..f1ea6c517cfb 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -26,7 +26,7 @@ static struct crypto_shash *essiv_hash_tfm; * * Return: Zero on success; non-zero otherwise. 
*/ -static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], +static int derive_key_aes(u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], const struct fscrypt_key *source_key, u8 derived_raw_key[FS_MAX_KEY_SIZE]) { @@ -51,7 +51,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); res = crypto_skcipher_setkey(tfm, deriving_key, - FS_AES_128_ECB_KEY_SIZE); + FS_KEY_DERIVATION_NONCE_SIZE); if (res < 0) goto out; @@ -99,7 +99,6 @@ static int validate_user_key(struct fscrypt_info *crypt_info, goto out; } master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE || master_key->size % AES_BLOCK_SIZE != 0) { @@ -120,14 +119,10 @@ static const struct { const char *cipher_str; int keysize; } available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", - FS_AES_256_XTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", - FS_AES_256_CTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", - FS_AES_128_CBC_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", - FS_AES_128_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", 64 }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", 32 }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", 16 }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", 16 }, }; static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, From 52359cf4fd6dd5208b6b9613df5140dfd9a329c7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:47 -0700 Subject: [PATCH 689/804] fscrypt: use a common logging function Use a common function for fscrypt warning and error messages so that all the messages are consistently ratelimited, include the "fscrypt:" prefix, and include the filesystem name if applicable. 
Also fix up a few of the log messages to be more descriptive. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/crypto.c | 28 +++++++++++++++++++++++++--- fs/crypto/fname.c | 10 ++++++---- fs/crypto/fscrypt_private.h | 8 ++++++++ fs/crypto/hooks.c | 5 +++-- fs/crypto/keyinfo.c | 27 +++++++++++++++------------ 5 files changed, 57 insertions(+), 21 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b12c53e6efb1..0f46cf550907 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -180,9 +180,10 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_skcipher_encrypt() returned %d\n", - __func__, res); + fscrypt_err(inode->i_sb, + "%scryption failed for inode %lu, block %llu: %d", + (rw == FS_DECRYPT ? "de" : "en"), + inode->i_ino, lblk_num, res); return res; } return 0; @@ -422,6 +423,27 @@ fail: return res; } +void fscrypt_msg(struct super_block *sb, const char *level, + const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf); + else + printk("%sfscrypt: %pV\n", level, &vaf); + va_end(args); +} + /** * fscrypt_init() - Set up for fs encryption. 
*/ diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index b1b69ec4b4ff..1bdb9f226eec 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -70,8 +70,9 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename encryption failed for inode %lu: %d", + inode->i_ino, res); return res; } @@ -114,8 +115,9 @@ static int fname_decrypt(struct inode *inode, res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename decryption failed for inode %lu: %d", + inode->i_ino, res); return res; } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index cc64e7e42fa1..92c6c0ace1b1 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -109,6 +109,14 @@ extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); extern const struct dentry_operations fscrypt_d_ops; +extern void __printf(3, 4) __cold +fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...); + +#define fscrypt_warn(sb, fmt, ...) \ + fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) +#define fscrypt_err(sb, fmt, ...) 
\ + fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) + /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index bc010e4609ef..b5328a0c6364 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -39,8 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { - pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", - d_inode(dir)->i_ino, inode->i_ino); + fscrypt_warn(inode->i_sb, + "inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); err = -EPERM; } dput(dir); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index f1ea6c517cfb..580117f81a54 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -102,9 +102,8 @@ static int validate_user_key(struct fscrypt_info *crypt_info, if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE || master_key->size % AES_BLOCK_SIZE != 0) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); + fscrypt_warn(NULL, "key size incorrect: %u", + master_key->size); res = -ENOKEY; goto out; } @@ -131,9 +130,10 @@ static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, u32 mode; if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { - pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", - inode->i_ino, - ci->ci_data_mode, ci->ci_filename_mode); + fscrypt_warn(inode->i_sb, + "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", + inode->i_ino, ci->ci_data_mode, + ci->ci_filename_mode); return -EINVAL; } @@ -172,8 +172,9 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) tfm = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(tfm)) { 
- pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", - PTR_ERR(tfm)); + fscrypt_warn(NULL, + "error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); return PTR_ERR(tfm); } prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); @@ -308,8 +309,9 @@ int fscrypt_get_encryption_info(struct inode *inode) ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (IS_ERR(ctfm)) { res = PTR_ERR(ctfm); - pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", - __func__, res, inode->i_ino); + fscrypt_warn(inode->i_sb, + "error allocating '%s' transform for inode %lu: %d", + cipher_str, inode->i_ino, res); goto out; } crypt_info->ci_ctfm = ctfm; @@ -326,8 +328,9 @@ int fscrypt_get_encryption_info(struct inode *inode) crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { res = init_essiv_generator(crypt_info, raw_key, keysize); if (res) { - pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", - __func__, res, inode->i_ino); + fscrypt_warn(inode->i_sb, + "error initializing ESSIV generator for inode %lu: %d", + inode->i_ino, res); goto out; } } From f68a71fa8f773c82ed70feb398d7b1ab8cca2dd3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:48 -0700 Subject: [PATCH 690/804] fscrypt: separate key lookup from key derivation Refactor the confusingly-named function 'validate_user_key()' into a new function 'find_and_derive_key()' which first finds the keyring key, then does the key derivation. Among other benefits this avoids the strange behavior we had previously where if key derivation failed for some reason, then we would fall back to the alternate key prefix. Now, we'll only fall back to the alternate key prefix if a valid key isn't found. This patch also improves the warning messages that are logged when the keyring key's payload is invalid. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 124 +++++++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 49 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 580117f81a54..86177a7b1001 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -26,7 +26,7 @@ static struct crypto_shash *essiv_hash_tfm; * * Return: Zero on success; non-zero otherwise. */ -static int derive_key_aes(u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], +static int derive_key_aes(const u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], const struct fscrypt_key *source_key, u8 derived_raw_key[FS_MAX_KEY_SIZE]) { @@ -66,52 +66,88 @@ out: return res; } -static int validate_user_key(struct fscrypt_info *crypt_info, - struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix, int min_keysize) +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. 
+ */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct fscrypt_key **payload_ret) { char *description; - struct key *keyring_key; - struct fscrypt_key *master_key; + struct key *key; const struct user_key_payload *ukp; - int res; + const struct fscrypt_key *payload; description = kasprintf(GFP_NOFS, "%s%*phN", prefix, - FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); + FS_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - keyring_key = request_key(&key_type_logon, description, NULL); + key = request_key(&key_type_logon, description, NULL); kfree(description); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - down_read(&keyring_key->sem); + if (IS_ERR(key)) + return key; - ukp = user_key_payload(keyring_key); - if (!ukp) { - /* key was revoked before we acquired its semaphore */ - res = -EKEYREVOKED; - goto out; - } - if (ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct fscrypt_key *)ukp->data; + down_read(&key->sem); + ukp = user_key_payload(key); - if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE - || master_key->size % AES_BLOCK_SIZE != 0) { - fscrypt_warn(NULL, "key size incorrect: %u", - master_key->size); - res = -ENOKEY; - goto out; + if (!ukp) /* was the key revoked before we acquired its semaphore? 
*/ + goto invalid; + + payload = (const struct fscrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct fscrypt_key) || + payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) { + fscrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; } - res = derive_key_aes(ctx->nonce, master_key, raw_key); -out: - up_read(&keyring_key->sem); - key_put(keyring_key); - return res; + + if (payload->size < min_keysize || + payload->size % AES_BLOCK_SIZE != 0) { + fscrypt_warn(NULL, + "key with description '%s' is too short or is misaligned (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; + } + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Find the master key, then derive the inode's actual encryption key */ +static int find_and_derive_key(const struct inode *inode, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) +{ + struct key *key; + const struct fscrypt_key *payload; + int err; + + key = find_and_lock_process_key(FS_KEY_DESC_PREFIX, + ctx->master_key_descriptor, + derived_keysize, &payload); + if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) { + key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix, + ctx->master_key_descriptor, + derived_keysize, &payload); + } + if (IS_ERR(key)) + return PTR_ERR(key); + err = derive_key_aes(ctx->nonce, payload, derived_key); + up_read(&key->sem); + key_put(key); + return err; } static const struct { @@ -292,20 +328,10 @@ int fscrypt_get_encryption_info(struct inode *inode) if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, - keysize); - if (res && inode->i_sb->s_cop->key_prefix) { - int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix, - keysize); - if (res2) { - if (res2 == -ENOKEY) - res = -ENOKEY; - goto out; - } - } 
else if (res) { + res = find_and_derive_key(inode, &ctx, raw_key, keysize); + if (res) goto out; - } + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (IS_ERR(ctfm)) { res = PTR_ERR(ctfm); From 27a0e77380a3767929df1b4e563096f51b90a041 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Apr 2018 15:51:49 -0700 Subject: [PATCH 691/804] fscrypt: only derive the needed portion of the key Currently the key derivation function in fscrypt uses the master key length as the amount of output key material to derive. This works, but it means we can waste time deriving more key material than is actually used, e.g. most commonly, deriving 64 bytes for directories which only take a 32-byte AES-256-CTS-CBC key. It also forces us to validate that the master key length is a multiple of AES_BLOCK_SIZE, which wouldn't otherwise be necessary. Fix it to only derive the needed length key. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 86177a7b1001..44bcb695c206 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -18,17 +18,16 @@ static struct crypto_shash *essiv_hash_tfm; -/** - * derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivation. - * @source_key: Source key to which to apply derivation. - * @derived_raw_key: Derived raw key. +/* + * Key derivation function. This generates the derived key by encrypting the + * master key with AES-128-ECB using the inode's nonce as the AES key. * - * Return: Zero on success; non-zero otherwise. + * The master key must be at least as long as the derived key. If the master + * key is longer, then only the first 'derived_keysize' bytes are used. 
*/ -static int derive_key_aes(const u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], - const struct fscrypt_key *source_key, - u8 derived_raw_key[FS_MAX_KEY_SIZE]) +static int derive_key_aes(const u8 *master_key, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) { int res = 0; struct skcipher_request *req = NULL; @@ -50,14 +49,13 @@ static int derive_key_aes(const u8 deriving_key[FS_KEY_DERIVATION_NONCE_SIZE], skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, deriving_key, - FS_KEY_DERIVATION_NONCE_SIZE); + res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce)); if (res < 0) goto out; - sg_init_one(&src_sg, source_key->raw, source_key->size); - sg_init_one(&dst_sg, derived_raw_key, source_key->size); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, NULL); res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: @@ -108,10 +106,9 @@ find_and_lock_process_key(const char *prefix, goto invalid; } - if (payload->size < min_keysize || - payload->size % AES_BLOCK_SIZE != 0) { + if (payload->size < min_keysize) { fscrypt_warn(NULL, - "key with description '%s' is too short or is misaligned (got %u bytes, need %u+ bytes)", + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", key->description, payload->size, min_keysize); goto invalid; } @@ -144,7 +141,7 @@ static int find_and_derive_key(const struct inode *inode, } if (IS_ERR(key)) return PTR_ERR(key); - err = derive_key_aes(ctx->nonce, payload, derived_key); + err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize); up_read(&key->sem); key_put(key); return err; @@ -324,7 +321,7 @@ int fscrypt_get_encryption_info(struct inode *inode) * crypto 
API as part of key derivation. */ res = -ENOMEM; - raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + raw_key = kmalloc(keysize, GFP_NOFS); if (!raw_key) goto out; @@ -342,10 +339,6 @@ int fscrypt_get_encryption_info(struct inode *inode) } crypt_info->ci_ctfm = ctfm; crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - /* - * if the provided key is longer than keysize, we use the first - * keysize bytes of the derived key only - */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; From eb13e0b69296ad1d3a9a3fa0cb6570aaf99f9f0c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 7 May 2018 17:22:08 -0700 Subject: [PATCH 692/804] fscrypt: add Speck128/256 support fscrypt currently only supports AES encryption. However, many low-end mobile devices have older CPUs that don't have AES instructions, e.g. the ARMv8 Cryptography Extensions. Currently, user data on such devices is not encrypted at rest because AES is too slow, even when the NEON bit-sliced implementation of AES is used. Unfortunately, it is infeasible to encrypt these devices at all when AES is the only option. Therefore, this patch updates fscrypt to support the Speck block cipher, which was recently added to the crypto API. The C implementation of Speck is not especially fast, but Speck can be implemented very efficiently with general-purpose vector instructions, e.g. ARM NEON. For example, on an ARMv7 processor, we measured the NEON-accelerated Speck128/256-XTS at 69 MB/s for both encryption and decryption, while AES-256-XTS with the NEON bit-sliced implementation was only 22 MB/s encryption and 19 MB/s decryption. There are multiple variants of Speck. This patch only adds support for Speck128/256, which is the variant with a 128-bit block size and 256-bit key size -- the same as AES-256. This is believed to be the most secure variant of Speck, and it's only about 6% slower than Speck128/128. 
Speck64/128 would be at least 20% faster because it has 20% fewer rounds, and it can be even faster on CPUs that can't efficiently do the 64-bit operations needed for Speck128. However, Speck64's 64-bit block size is not preferred security-wise. ARM NEON also supports the needed 64-bit operations even on 32-bit CPUs, resulting in Speck128 being fast enough for our targeted use cases so far. The chosen modes of operation are XTS for contents and CTS-CBC for filenames. These are the same modes of operation that fscrypt defaults to for AES. Note that as with the other fscrypt modes, Speck will not be used unless userspace chooses to use it. Nor are any of the existing modes (which are all AES-based) being removed, of course. We intentionally don't make CONFIG_FS_ENCRYPTION select CONFIG_CRYPTO_SPECK, so people will have to enable Speck support themselves if they need it. This is because we shouldn't bloat the FS_ENCRYPTION dependencies with every new cipher, especially ones that aren't recommended for most users. Moreover, CRYPTO_SPECK is just the generic implementation, which won't be fast enough for many users; in practice, they'll need to enable CRYPTO_SPECK_NEON to get acceptable performance. More details about our choice of Speck can be found in our patches that added Speck to the crypto API, and the follow-on discussion threads. We're planning a publication that explains the choice in more detail. But briefly, we can't use ChaCha20 as we previously proposed, since it would be insecure to use a stream cipher in this context, with potential IV reuse during writes on f2fs and/or on wear-leveling flash storage. We also evaluated many other lightweight and/or ARX-based block ciphers such as Chaskey-LTS, RC5, LEA, CHAM, Threefish, RC6, NOEKEON, SPARX, and XTEA. However, all had disadvantages vs. Speck, such as insufficient performance with NEON, much less published cryptanalysis, or an insufficient security level.
Various design choices in Speck make it perform better with NEON than competing ciphers while still having a security margin similar to AES, and in the case of Speck128 also the same available security levels. Unfortunately, Speck does have some political baggage attached -- it's an NSA designed cipher, and was rejected from an ISO standard (though for context, as far as I know none of the above-mentioned alternatives are ISO standards either). Nevertheless, we believe it is a good solution to the problem from a technical perspective. Certain algorithms constructed from ChaCha or the ChaCha permutation, such as MEM (Masked Even-Mansour) or HPolyC, may also meet our performance requirements. However, these are new constructions that need more time to receive the cryptographic review and acceptance needed to be confident in their security. HPolyC hasn't been published yet, and we are concerned that MEM makes stronger assumptions about the underlying permutation than the ChaCha stream cipher does. In contrast, the XTS mode of operation is relatively well accepted, and Speck has over 70 cryptanalysis papers. Of course, these ChaCha-based algorithms can still be added later if they become ready. The best known attack on Speck128/256 is a differential cryptanalysis attack on 25 of 34 rounds with 2^253 time complexity and 2^125 chosen plaintexts, i.e. only marginally faster than brute force. There is no known attack on the full 34 rounds. 
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- Documentation/filesystems/fscrypt.rst | 626 ++++++++++++++++++++++++++ fs/crypto/fscrypt_private.h | 4 + fs/crypto/keyinfo.c | 2 + include/uapi/linux/fs.h | 2 + 4 files changed, 634 insertions(+) create mode 100644 Documentation/filesystems/fscrypt.rst diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst new file mode 100644 index 000000000000..48b424de85bb --- /dev/null +++ b/Documentation/filesystems/fscrypt.rst @@ -0,0 +1,626 @@ +===================================== +Filesystem-level encryption (fscrypt) +===================================== + +Introduction +============ + +fscrypt is a library which filesystems can hook into to support +transparent encryption of files and directories. + +Note: "fscrypt" in this document refers to the kernel-level portion, +implemented in ``fs/crypto/``, as opposed to the userspace tool +`fscrypt <https://github.com/google/fscrypt>`_. This document only +covers the kernel-level portion. For command-line examples of how to +use encryption, see the documentation for the userspace tool `fscrypt +<https://github.com/google/fscrypt>`_. Also, it is recommended to use +the fscrypt userspace tool, or other existing userspace tools such as +`fscryptctl <https://github.com/google/fscryptctl>`_ or `Android's key +management system +<https://source.android.com/security/encryption/file-based>`_, over +using the kernel's API directly. Using existing tools reduces the +chance of introducing your own security bugs. (Nevertheless, for +completeness this documentation covers the kernel's API anyway.) + +Unlike dm-crypt, fscrypt operates at the filesystem level rather than +at the block device level. This allows it to encrypt different files +with different keys and to have unencrypted files on the same +filesystem. This is useful for multi-user systems where each user's +data-at-rest needs to be cryptographically isolated from the others. +However, except for filenames, fscrypt does not encrypt filesystem +metadata.
+ +Unlike eCryptfs, which is a stacked filesystem, fscrypt is integrated +directly into supported filesystems --- currently ext4, F2FS, and +UBIFS. This allows encrypted files to be read and written without +caching both the decrypted and encrypted pages in the pagecache, +thereby nearly halving the memory used and bringing it in line with +unencrypted files. Similarly, half as many dentries and inodes are +needed. eCryptfs also limits encrypted filenames to 143 bytes, +causing application compatibility issues; fscrypt allows the full 255 +bytes (NAME_MAX). Finally, unlike eCryptfs, the fscrypt API can be +used by unprivileged users, with no need to mount anything. + +fscrypt does not support encrypting files in-place. Instead, it +supports marking an empty directory as encrypted. Then, after +userspace provides the key, all regular files, directories, and +symbolic links created in that directory tree are transparently +encrypted. + +Threat model +============ + +Offline attacks +--------------- + +Provided that userspace chooses a strong encryption key, fscrypt +protects the confidentiality of file contents and filenames in the +event of a single point-in-time permanent offline compromise of the +block device content. fscrypt does not protect the confidentiality of +non-filename metadata, e.g. file sizes, file permissions, file +timestamps, and extended attributes. Also, the existence and location +of holes (unallocated blocks which logically contain all zeroes) in +files is not protected. + +fscrypt is not guaranteed to protect confidentiality or authenticity +if an attacker is able to manipulate the filesystem offline prior to +an authorized user later accessing the filesystem. + +Online attacks +-------------- + +fscrypt (and storage encryption in general) can only provide limited +protection, if any at all, against online attacks. 
In detail: + +fscrypt is only resistant to side-channel attacks, such as timing or +electromagnetic attacks, to the extent that the underlying Linux +Cryptographic API algorithms are. If a vulnerable algorithm is used, +such as a table-based implementation of AES, it may be possible for an +attacker to mount a side channel attack against the online system. +Side channel attacks may also be mounted against applications +consuming decrypted data. + +After an encryption key has been provided, fscrypt is not designed to +hide the plaintext file contents or filenames from other users on the +same system, regardless of the visibility of the keyring key. +Instead, existing access control mechanisms such as file mode bits, +POSIX ACLs, LSMs, or mount namespaces should be used for this purpose. +Also note that as long as the encryption keys are *anywhere* in +memory, an online attacker can necessarily compromise them by mounting +a physical attack or by exploiting any kernel security vulnerability +which provides an arbitrary memory read primitive. + +While it is ostensibly possible to "evict" keys from the system, +recently accessed encrypted files will remain accessible at least +until the filesystem is unmounted or the VFS caches are dropped, e.g. +using ``echo 2 > /proc/sys/vm/drop_caches``. Even after that, if the +RAM is compromised before being powered off, it will likely still be +possible to recover portions of the plaintext file contents, if not +some of the encryption keys as well. (Since Linux v4.12, all +in-kernel keys related to fscrypt are sanitized before being freed. +However, userspace would need to do its part as well.) + +Currently, fscrypt does not prevent a user from maliciously providing +an incorrect key for another user's existing encrypted files. A +protection against this is planned. + +Key hierarchy +============= + +Master Keys +----------- + +Each encrypted directory tree is protected by a *master key*. 
Master +keys can be up to 64 bytes long, and must be at least as long as the +greater of the key length needed by the contents and filenames +encryption modes being used. For example, if AES-256-XTS is used for +contents encryption, the master key must be 64 bytes (512 bits). Note +that the XTS mode is defined to require a key twice as long as that +required by the underlying block cipher. + +To "unlock" an encrypted directory tree, userspace must provide the +appropriate master key. There can be any number of master keys, each +of which protects any number of directory trees on any number of +filesystems. + +Userspace should generate master keys either using a cryptographically +secure random number generator, or by using a KDF (Key Derivation +Function). Note that whenever a KDF is used to "stretch" a +lower-entropy secret such as a passphrase, it is critical that a KDF +designed for this purpose be used, such as scrypt, PBKDF2, or Argon2. + +Per-file keys +------------- + +Master keys are not used to encrypt file contents or names directly. +Instead, a unique key is derived for each encrypted file, including +each regular file, directory, and symbolic link. This has several +advantages: + +- In cryptosystems, the same key material should never be used for + different purposes. Using the master key as both an XTS key for + contents encryption and as a CTS-CBC key for filenames encryption + would violate this rule. +- Per-file keys simplify the choice of IVs (Initialization Vectors) + for contents encryption. Without per-file keys, to ensure IV + uniqueness both the inode and logical block number would need to be + encoded in the IVs. This would make it impossible to renumber + inodes, which e.g. ``resize2fs`` can do when resizing an ext4 + filesystem. With per-file keys, it is sufficient to encode just the + logical block number in the IVs. +- Per-file keys strengthen the encryption of filenames, where IVs are + reused out of necessity. 
With a unique key per directory, IV reuse + is limited to within a single directory. +- Per-file keys allow individual files to be securely erased simply by + securely erasing their keys. (Not yet implemented.) + +A KDF (Key Derivation Function) is used to derive per-file keys from +the master key. This is done instead of wrapping a randomly-generated +key for each file because it reduces the size of the encryption xattr, +which for some filesystems makes the xattr more likely to fit in-line +in the filesystem's inode table. With a KDF, only a 16-byte nonce is +required --- long enough to make key reuse extremely unlikely. A +wrapped key, on the other hand, would need to be up to 64 bytes --- +the length of an AES-256-XTS key. Furthermore, currently there is no +requirement to support unlocking a file with multiple alternative +master keys or to support rotating master keys. Instead, the master +keys may be wrapped in userspace, e.g. as done by the `fscrypt +<https://github.com/google/fscrypt>`_ tool. + +The current KDF encrypts the master key using the 16-byte nonce as an +AES-128-ECB key. The output is used as the derived key. If the +output is longer than needed, then it is truncated to the needed +length. Truncation is the norm for directories and symlinks, since +those use the CTS-CBC encryption mode which requires a key half as +long as that required by the XTS encryption mode. + +Note: this KDF meets the primary security requirement, which is to +produce unique derived keys that preserve the entropy of the master +key, assuming that the master key is already a good pseudorandom key. +However, it is nonstandard and has some problems such as being +reversible, so it is generally considered to be a mistake! It may be +replaced with HKDF or another more standard KDF in the future. + +Encryption modes and usage +========================== + +fscrypt allows one encryption mode to be specified for file contents +and one encryption mode to be specified for filenames.
Different +directory trees are permitted to use different encryption modes. +Currently, the following pairs of encryption modes are supported: + +- AES-256-XTS for contents and AES-256-CTS-CBC for filenames +- AES-128-CBC for contents and AES-128-CTS-CBC for filenames +- Speck128/256-XTS for contents and Speck128/256-CTS-CBC for filenames + +It is strongly recommended to use AES-256-XTS for contents encryption. +AES-128-CBC was added only for low-powered embedded devices with +crypto accelerators such as CAAM or CESA that do not support XTS. + +Similarly, Speck128/256 support was only added for older or low-end +CPUs which cannot do AES fast enough -- especially ARM CPUs which have +NEON instructions but not the Cryptography Extensions -- and for which +it would not otherwise be feasible to use encryption at all. It is +not recommended to use Speck on CPUs that have AES instructions. +Speck support is only available if it has been enabled in the crypto +API via CONFIG_CRYPTO_SPECK. Also, on ARM platforms, to get +acceptable performance CONFIG_CRYPTO_SPECK_NEON must be enabled. + +New encryption modes can be added relatively easily, without changes +to individual filesystems. However, authenticated encryption (AE) +modes are not currently supported because of the difficulty of dealing +with ciphertext expansion. + +For file contents, each filesystem block is encrypted independently. +Currently, only the case where the filesystem block size is equal to +the system's page size (usually 4096 bytes) is supported. With the +XTS mode of operation (recommended), the logical block number within +the file is used as the IV. With the CBC mode of operation (not +recommended), ESSIV is used; specifically, the IV for CBC is the +logical block number encrypted with AES-256, where the AES-256 key is +the SHA-256 hash of the inode's data encryption key. + +For filenames, the full filename is encrypted at once. 
Because of the +requirements to retain support for efficient directory lookups and +filenames of up to 255 bytes, a constant initialization vector (IV) is +used. However, each encrypted directory uses a unique key, which +limits IV reuse to within a single directory. Note that IV reuse in +the context of CTS-CBC encryption means that when the original +filenames share a common prefix at least as long as the cipher block +size (16 bytes for AES), the corresponding encrypted filenames will +also share a common prefix. This is undesirable; it may be fixed in +the future by switching to an encryption mode that is a strong +pseudorandom permutation on arbitrary-length messages, e.g. the HEH +(Hash-Encrypt-Hash) mode. + +Since filenames are encrypted with the CTS-CBC mode of operation, the +plaintext and ciphertext filenames need not be multiples of the AES +block size, i.e. 16 bytes. However, the minimum size that can be +encrypted is 16 bytes, so shorter filenames are NUL-padded to 16 bytes +before being encrypted. In addition, to reduce leakage of filename +lengths via their ciphertexts, all filenames are NUL-padded to the +next 4, 8, 16, or 32-byte boundary (configurable). 32 is recommended +since this provides the best confidentiality, at the cost of making +directory entries consume slightly more space. Note that since NUL +(``\0``) is not otherwise a valid character in filenames, the padding +will never produce duplicate plaintexts. + +Symbolic link targets are considered a type of filename and are +encrypted in the same way as filenames in directory entries. Each +symlink also uses a unique key; hence, the hardcoded IV is not a +problem for symlinks. + +User API +======== + +Setting an encryption policy +---------------------------- + +The FS_IOC_SET_ENCRYPTION_POLICY ioctl sets an encryption policy on an +empty directory or verifies that a directory or regular file already +has the specified encryption policy. 
It takes in a pointer to a +:c:type:`struct fscrypt_policy`, defined as follows:: + + #define FS_KEY_DESCRIPTOR_SIZE 8 + + struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + }; + +This structure must be initialized as follows: + +- ``version`` must be 0. + +- ``contents_encryption_mode`` and ``filenames_encryption_mode`` must + be set to constants from ``<linux/fs.h>`` which identify the + encryption modes to use. If unsure, use + FS_ENCRYPTION_MODE_AES_256_XTS (1) for ``contents_encryption_mode`` + and FS_ENCRYPTION_MODE_AES_256_CTS (4) for + ``filenames_encryption_mode``. + +- ``flags`` must be set to a value from ``<linux/fs.h>`` which + identifies the amount of NUL-padding to use when encrypting + filenames. If unsure, use FS_POLICY_FLAGS_PAD_32 (0x3). + +- ``master_key_descriptor`` specifies how to find the master key in + the keyring; see `Adding keys`_. It is up to userspace to choose a + unique ``master_key_descriptor`` for each master key. The e4crypt + and fscrypt tools use the first 8 bytes of + ``SHA-512(SHA-512(master_key))``, but this particular scheme is not + required. Also, the master key need not be in the keyring yet when + FS_IOC_SET_ENCRYPTION_POLICY is executed. However, it must be added + before any files can be created in the encrypted directory. + +If the file is not yet encrypted, then FS_IOC_SET_ENCRYPTION_POLICY +verifies that the file is an empty directory. If so, the specified +encryption policy is assigned to the directory, turning it into an +encrypted directory. After that, and after providing the +corresponding master key as described in `Adding keys`_, all regular +files, directories (recursively), and symlinks created in the +directory will be encrypted, inheriting the same encryption policy. +The filenames in the directory's entries will be encrypted as well.
+ +Alternatively, if the file is already encrypted, then +FS_IOC_SET_ENCRYPTION_POLICY validates that the specified encryption +policy exactly matches the actual one. If they match, then the ioctl +returns 0. Otherwise, it fails with EEXIST. This works on both +regular files and directories, including nonempty directories. + +Note that the ext4 filesystem does not allow the root directory to be +encrypted, even if it is empty. Users who want to encrypt an entire +filesystem with one key should consider using dm-crypt instead. + +FS_IOC_SET_ENCRYPTION_POLICY can fail with the following errors: + +- ``EACCES``: the file is not owned by the process's uid, nor does the + process have the CAP_FOWNER capability in a namespace with the file + owner's uid mapped +- ``EEXIST``: the file is already encrypted with an encryption policy + different from the one specified +- ``EINVAL``: an invalid encryption policy was specified (invalid + version, mode(s), or flags) +- ``ENOTDIR``: the file is unencrypted and is a regular file, not a + directory +- ``ENOTEMPTY``: the file is unencrypted and is a nonempty directory +- ``ENOTTY``: this type of filesystem does not implement encryption +- ``EOPNOTSUPP``: the kernel was not configured with encryption + support for this filesystem, or the filesystem superblock has not + had encryption enabled on it. (For example, to use encryption on an + ext4 filesystem, CONFIG_EXT4_ENCRYPTION must be enabled in the + kernel config, and the superblock must have had the "encrypt" + feature flag enabled using ``tune2fs -O encrypt`` or ``mkfs.ext4 -O + encrypt``.) +- ``EPERM``: this directory may not be encrypted, e.g. because it is + the root directory of an ext4 filesystem +- ``EROFS``: the filesystem is readonly + +Getting an encryption policy +---------------------------- + +The FS_IOC_GET_ENCRYPTION_POLICY ioctl retrieves the :c:type:`struct +fscrypt_policy`, if any, for a directory or regular file. See above +for the struct definition. 
No additional permissions are required +beyond the ability to open the file. + +FS_IOC_GET_ENCRYPTION_POLICY can fail with the following errors: + +- ``EINVAL``: the file is encrypted, but it uses an unrecognized + encryption context format +- ``ENODATA``: the file is not encrypted +- ``ENOTTY``: this type of filesystem does not implement encryption +- ``EOPNOTSUPP``: the kernel was not configured with encryption + support for this filesystem + +Note: if you only need to know whether a file is encrypted or not, on +most filesystems it is also possible to use the FS_IOC_GETFLAGS ioctl +and check for FS_ENCRYPT_FL, or to use the statx() system call and +check for STATX_ATTR_ENCRYPTED in stx_attributes. + +Getting the per-filesystem salt +------------------------------- + +Some filesystems, such as ext4 and F2FS, also support the deprecated +ioctl FS_IOC_GET_ENCRYPTION_PWSALT. This ioctl retrieves a randomly +generated 16-byte value stored in the filesystem superblock. This +value is intended to be used as a salt when deriving an encryption key +from a passphrase or other low-entropy user credential. + +FS_IOC_GET_ENCRYPTION_PWSALT is deprecated. Instead, prefer to +generate and manage any needed salt(s) in userspace. + +Adding keys +----------- + +To provide a master key, userspace must add it to an appropriate +keyring using the add_key() system call (see: +``Documentation/security/keys/core.rst``). The key type must be +"logon"; keys of this type are kept in kernel memory and cannot be +read back by userspace. The key description must be "fscrypt:" +followed by the 16-character lower case hex representation of the +``master_key_descriptor`` that was set in the encryption policy. The +key payload must conform to the following structure:: + + #define FS_MAX_KEY_SIZE 64 + + struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; + }; + +``mode`` is ignored; just set it to 0.
The actual key is provided in +``raw`` with ``size`` indicating its size in bytes. That is, the +bytes ``raw[0..size-1]`` (inclusive) are the actual key. + +The key description prefix "fscrypt:" may alternatively be replaced +with a filesystem-specific prefix such as "ext4:". However, the +filesystem-specific prefixes are deprecated and should not be used in +new programs. + +There are several different types of keyrings in which encryption keys +may be placed, such as a session keyring, a user session keyring, or a +user keyring. Each key must be placed in a keyring that is "attached" +to all processes that might need to access files encrypted with it, in +the sense that request_key() will find the key. Generally, if only +processes belonging to a specific user need to access a given +encrypted directory and no session keyring has been installed, then +that directory's key should be placed in that user's user session +keyring or user keyring. Otherwise, a session keyring should be +installed if needed, and the key should be linked into that session +keyring, or in a keyring linked into that session keyring. + +Note: introducing the complex visibility semantics of keyrings here +was arguably a mistake --- especially given that by design, after any +process successfully opens an encrypted file (thereby setting up the +per-file key), possessing the keyring key is not actually required for +any process to read/write the file until its in-memory inode is +evicted. In the future there probably should be a way to provide keys +directly to the filesystem instead, which would make the intended +semantics clearer. + +Access semantics +================ + +With the key +------------ + +With the encryption key, encrypted regular files, directories, and +symlinks behave very similarly to their unencrypted counterparts --- +after all, the encryption is intended to be transparent. 
However, +astute users may notice some differences in behavior: + +- Unencrypted files, or files encrypted with a different encryption + policy (i.e. different key, modes, or flags), cannot be renamed or + linked into an encrypted directory; see `Encryption policy + enforcement`_. Attempts to do so will fail with EPERM. However, + encrypted files can be renamed within an encrypted directory, or + into an unencrypted directory. + +- Direct I/O is not supported on encrypted files. Attempts to use + direct I/O on such files will fall back to buffered I/O. + +- The fallocate operations FALLOC_FL_COLLAPSE_RANGE, + FALLOC_FL_INSERT_RANGE, and FALLOC_FL_ZERO_RANGE are not supported + on encrypted files and will fail with EOPNOTSUPP. + +- Online defragmentation of encrypted files is not supported. The + EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with + EOPNOTSUPP. + +- The ext4 filesystem does not support data journaling with encrypted + regular files. It will fall back to ordered data mode instead. + +- DAX (Direct Access) is not supported on encrypted files. + +- The st_size of an encrypted symlink will not necessarily give the + length of the symlink target as required by POSIX. It will actually + give the length of the ciphertext, which will be slightly longer + than the plaintext due to NUL-padding and an extra 2-byte overhead. + +- The maximum length of an encrypted symlink is 2 bytes shorter than + the maximum length of an unencrypted symlink. For example, on an + EXT4 filesystem with a 4K block size, unencrypted symlinks can be up + to 4095 bytes long, while encrypted symlinks can only be up to 4093 + bytes long (both lengths excluding the terminating null). + +Note that mmap *is* supported. This is possible because the pagecache +for an encrypted file contains the plaintext, not the ciphertext. 
+ +Without the key +--------------- + +Some filesystem operations may be performed on encrypted regular +files, directories, and symlinks even before their encryption key has +been provided: + +- File metadata may be read, e.g. using stat(). + +- Directories may be listed, in which case the filenames will be + listed in an encoded form derived from their ciphertext. The + current encoding algorithm is described in `Filename hashing and + encoding`_. The algorithm is subject to change, but it is + guaranteed that the presented filenames will be no longer than + NAME_MAX bytes, will not contain the ``/`` or ``\0`` characters, and + will uniquely identify directory entries. + + The ``.`` and ``..`` directory entries are special. They are always + present and are not encrypted or encoded. + +- Files may be deleted. That is, nondirectory files may be deleted + with unlink() as usual, and empty directories may be deleted with + rmdir() as usual. Therefore, ``rm`` and ``rm -r`` will work as + expected. + +- Symlink targets may be read and followed, but they will be presented + in encrypted form, similar to filenames in directories. Hence, they + are unlikely to point to anywhere useful. + +Without the key, regular files cannot be opened or truncated. +Attempts to do so will fail with ENOKEY. This implies that any +regular file operations that require a file descriptor, such as +read(), write(), mmap(), fallocate(), and ioctl(), are also forbidden. + +Also without the key, files of any type (including directories) cannot +be created or linked into an encrypted directory, nor can a name in an +encrypted directory be the source or target of a rename, nor can an +O_TMPFILE temporary file be created in an encrypted directory. All +such operations will fail with ENOKEY. + +It is not currently possible to backup and restore encrypted files +without the encryption key. This would require special APIs which +have not yet been implemented. 
+ +Encryption policy enforcement +============================= + +After an encryption policy has been set on a directory, all regular +files, directories, and symbolic links created in that directory +(recursively) will inherit that encryption policy. Special files --- +that is, named pipes, device nodes, and UNIX domain sockets --- will +not be encrypted. + +Except for those special files, it is forbidden to have unencrypted +files, or files encrypted with a different encryption policy, in an +encrypted directory tree. Attempts to link or rename such a file into +an encrypted directory will fail with EPERM. This is also enforced +during ->lookup() to provide limited protection against offline +attacks that try to disable or downgrade encryption in known locations +where applications may later write sensitive data. It is recommended +that systems implementing a form of "verified boot" take advantage of +this by validating all top-level encryption policies prior to access. + +Implementation details +====================== + +Encryption context +------------------ + +An encryption policy is represented on-disk by a :c:type:`struct +fscrypt_context`. It is up to individual filesystems to decide where +to store it, but normally it would be stored in a hidden extended +attribute. It should *not* be exposed by the xattr-related system +calls such as getxattr() and setxattr() because of the special +semantics of the encryption xattr. (In particular, there would be +much confusion if an encryption policy were to be added to or removed +from anything other than an empty directory.) 
The struct is defined +as follows:: + + #define FS_KEY_DESCRIPTOR_SIZE 8 + #define FS_KEY_DERIVATION_NONCE_SIZE 16 + + struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + }; + +Note that :c:type:`struct fscrypt_context` contains the same +information as :c:type:`struct fscrypt_policy` (see `Setting an +encryption policy`_), except that :c:type:`struct fscrypt_context` +also contains a nonce. The nonce is randomly generated by the kernel +and is used to derive the inode's encryption key as described in +`Per-file keys`_. + +Data path changes +----------------- + +For the read path (->readpage()) of regular files, filesystems can +read the ciphertext into the page cache and decrypt it in-place. The +page lock must be held until decryption has finished, to prevent the +page from becoming visible to userspace prematurely. + +For the write path (->writepage()) of regular files, filesystems +cannot encrypt data in-place in the page cache, since the cached +plaintext must be preserved. Instead, filesystems must encrypt into a +temporary buffer or "bounce page", then write out the temporary +buffer. Some filesystems, such as UBIFS, already use temporary +buffers regardless of encryption. Other filesystems, such as ext4 and +F2FS, have to allocate bounce pages specially for encryption. + +Filename hashing and encoding +----------------------------- + +Modern filesystems accelerate directory lookups by using indexed +directories. An indexed directory is organized as a tree keyed by +filename hashes. When a ->lookup() is requested, the filesystem +normally hashes the filename being looked up so that it can quickly +find the corresponding directory entry, if any. + +With encryption, lookups must be supported and efficient both with and +without the encryption key. 
Clearly, it would not work to hash the +plaintext filenames, since the plaintext filenames are unavailable +without the key. (Hashing the plaintext filenames would also make it +impossible for the filesystem's fsck tool to optimize encrypted +directories.) Instead, filesystems hash the ciphertext filenames, +i.e. the bytes actually stored on-disk in the directory entries. When +asked to do a ->lookup() with the key, the filesystem just encrypts +the user-supplied name to get the ciphertext. + +Lookups without the key are more complicated. The raw ciphertext may +contain the ``\0`` and ``/`` characters, which are illegal in +filenames. Therefore, readdir() must base64-encode the ciphertext for +presentation. For most filenames, this works fine; on ->lookup(), the +filesystem just base64-decodes the user-supplied name to get back to +the raw ciphertext. + +However, for very long filenames, base64 encoding would cause the +filename length to exceed NAME_MAX. To prevent this, readdir() +actually presents long filenames in an abbreviated form which encodes +a strong "hash" of the ciphertext filename, along with the optional +filesystem-specific hash(es) needed for directory lookups. This +allows the filesystem to still, with a high degree of confidence, map +the filename given in ->lookup() back to a particular directory entry +that was previously listed by readdir(). See :c:type:`struct +fscrypt_digested_name` in the source for more details. + +Note that the precise way that filenames are presented to userspace +without the key is subject to change in the future. It is only meant +as a way to temporarily present valid filenames so that commands like +``rm -r`` work as expected on encrypted directories. 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 92c6c0ace1b1..ea372cd53ab6 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -93,6 +93,10 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode, filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) return true; + if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS) + return true; + return false; } diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 44bcb695c206..154cd89c2212 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -155,6 +155,8 @@ static const struct { [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", 32 }, [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", 16 }, [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", 16 }, + [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { "xts(speck128)", 64 }, + [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { "cts(cbc(speck128))", 32 }, }; static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f3ef5016cf9c..52cedebfd202 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -191,6 +191,8 @@ struct inodes_stat_t { #define FS_ENCRYPTION_MODE_AES_256_CTS 4 #define FS_ENCRYPTION_MODE_AES_128_CBC 5 #define FS_ENCRYPTION_MODE_AES_128_CTS 6 +#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 +#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 struct fscrypt_policy { From a0ca4bdf47449c111a0225f49b644bf5e1fc72bd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 21 Jan 2016 17:10:56 +0800 Subject: [PATCH 693/804] crypto: skcipher - Add default key size helper While converting ecryptfs over to skcipher I found that it needs to pick a default key size if one isn't given. 
Rather than having it poke into the guts of the algorithm to get max_keysize, let's provide a helper that is meant to give a sane default (just in case we ever get an algorithm that has no maximum key size). Signed-off-by: Herbert Xu --- crypto/skcipher.c | 4 ++-- include/crypto/skcipher.h | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index d199c0b1751c..69230e9d4ac9 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -118,7 +118,7 @@ static int crypto_init_skcipher_ops_blkcipher(struct crypto_tfm *tfm) skcipher->decrypt = skcipher_decrypt_blkcipher; skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher); - skcipher->has_setkey = calg->cra_blkcipher.max_keysize; + skcipher->keysize = calg->cra_blkcipher.max_keysize; return 0; } @@ -211,7 +211,7 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) skcipher->ivsize = crypto_ablkcipher_ivsize(ablkcipher); skcipher->reqsize = crypto_ablkcipher_reqsize(ablkcipher) + sizeof(struct ablkcipher_request); - skcipher->has_setkey = calg->cra_ablkcipher.max_keysize; + skcipher->keysize = calg->cra_ablkcipher.max_keysize; return 0; } diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index fd8742a40ff3..2f07b4fce3e0 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -60,8 +60,7 @@ struct crypto_skcipher { unsigned int ivsize; unsigned int reqsize; - - bool has_setkey; + unsigned int keysize; struct crypto_tfm base; }; @@ -309,7 +308,13 @@ static inline int crypto_skcipher_setkey(struct crypto_skcipher *tfm, static inline bool crypto_skcipher_has_setkey(struct crypto_skcipher *tfm) { - return tfm->has_setkey; + return tfm->keysize; +} + +static inline unsigned int crypto_skcipher_default_keysize( + struct crypto_skcipher *tfm) +{ + return tfm->keysize; } /** From a9146e42354783b81999191970349a9e5a9d1c98 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 Jan 2016 22:14:36 +0800 Subject: 
[PATCH 694/804] crypto: skcipher - Add helper to retrieve driver name This patch adds the helper crypto_skcipher_driver_name which returns the driver name of the alg object for a given tfm. This is needed by ecryptfs. Signed-off-by: Herbert Xu --- include/crypto/skcipher.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 2f07b4fce3e0..41418790c536 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -231,6 +231,12 @@ static inline int crypto_has_skcipher(const char *alg_name, u32 type, crypto_skcipher_mask(mask)); } +static inline const char *crypto_skcipher_driver_name( + struct crypto_skcipher *tfm) +{ + return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)); +} + /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle From b24dcaae875314079dd4fe65ce231fd9b0bf58be Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 12 Jul 2016 13:17:31 +0800 Subject: [PATCH 695/804] crypto: skcipher - Add low-level skcipher interface This patch allows skcipher algorithms and instances to be created and registered with the crypto API. They are accessible through the top-level skcipher interface, along with ablkcipher/blkcipher algorithms and instances. This patch also introduces a new parameter called chunk size which is meant for ciphers such as CTR and CTS which ostensibly can handle arbitrary lengths, but still behave like block ciphers in that you can only process a partial block at the very end. For these ciphers the block size will continue to be set to 1 as it is now while the chunk size will be set to the underlying block size.
Signed-off-by: Herbert Xu --- crypto/skcipher.c | 196 +++++++++++++++++++++++++++-- include/crypto/internal/skcipher.h | 87 +++++++++++++ include/crypto/skcipher.h | 130 +++++++++++++++++++ include/linux/crypto.h | 1 + 4 files changed, 407 insertions(+), 7 deletions(-) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 69230e9d4ac9..d248008e7f7b 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -16,7 +16,11 @@ #include #include +#include #include +#include +#include +#include #include "internal.h" @@ -25,10 +29,11 @@ static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg) if (alg->cra_type == &crypto_blkcipher_type) return sizeof(struct crypto_blkcipher *); - BUG_ON(alg->cra_type != &crypto_ablkcipher_type && - alg->cra_type != &crypto_givcipher_type); + if (alg->cra_type == &crypto_ablkcipher_type || + alg->cra_type == &crypto_givcipher_type) + return sizeof(struct crypto_ablkcipher *); - return sizeof(struct crypto_ablkcipher *); + return crypto_alg_extsize(alg); } static int skcipher_setkey_blkcipher(struct crypto_skcipher *tfm, @@ -216,26 +221,118 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) return 0; } +static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); + struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); + + alg->exit(skcipher); +} + static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) { + struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); + struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); + if (tfm->__crt_alg->cra_type == &crypto_blkcipher_type) return crypto_init_skcipher_ops_blkcipher(tfm); - BUG_ON(tfm->__crt_alg->cra_type != &crypto_ablkcipher_type && - tfm->__crt_alg->cra_type != &crypto_givcipher_type); + if (tfm->__crt_alg->cra_type == &crypto_ablkcipher_type || + tfm->__crt_alg->cra_type == &crypto_givcipher_type) + return crypto_init_skcipher_ops_ablkcipher(tfm); - return
crypto_init_skcipher_ops_ablkcipher(tfm); + skcipher->setkey = alg->setkey; + skcipher->encrypt = alg->encrypt; + skcipher->decrypt = alg->decrypt; + skcipher->ivsize = alg->ivsize; + skcipher->keysize = alg->max_keysize; + + if (alg->exit) + skcipher->base.exit = crypto_skcipher_exit_tfm; + + if (alg->init) + return alg->init(skcipher); + + return 0; } +static void crypto_skcipher_free_instance(struct crypto_instance *inst) +{ + struct skcipher_instance *skcipher = + container_of(inst, struct skcipher_instance, s.base); + + skcipher->free(skcipher); +} + +static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) + __attribute__ ((unused)); +static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) +{ + struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg, + base); + + seq_printf(m, "type : skcipher\n"); + seq_printf(m, "async : %s\n", + alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); + seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); + seq_printf(m, "min keysize : %u\n", skcipher->min_keysize); + seq_printf(m, "max keysize : %u\n", skcipher->max_keysize); + seq_printf(m, "ivsize : %u\n", skcipher->ivsize); + seq_printf(m, "chunksize : %u\n", skcipher->chunksize); +} + +#ifdef CONFIG_NET +static int crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg) +{ + struct crypto_report_blkcipher rblkcipher; + struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg, + base); + + strncpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); + strncpy(rblkcipher.geniv, "", sizeof(rblkcipher.geniv)); + + rblkcipher.blocksize = alg->cra_blocksize; + rblkcipher.min_keysize = skcipher->min_keysize; + rblkcipher.max_keysize = skcipher->max_keysize; + rblkcipher.ivsize = skcipher->ivsize; + + if (nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, + sizeof(struct crypto_report_blkcipher), &rblkcipher)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} +#else +static int
crypto_skcipher_report(struct sk_buff *skb, struct crypto_alg *alg) +{ + return -ENOSYS; +} +#endif + static const struct crypto_type crypto_skcipher_type2 = { .extsize = crypto_skcipher_extsize, .init_tfm = crypto_skcipher_init_tfm, + .free = crypto_skcipher_free_instance, +#ifdef CONFIG_PROC_FS + .show = crypto_skcipher_show, +#endif + .report = crypto_skcipher_report, .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_BLKCIPHER_MASK, - .type = CRYPTO_ALG_TYPE_BLKCIPHER, + .type = CRYPTO_ALG_TYPE_SKCIPHER, .tfmsize = offsetof(struct crypto_skcipher, base), }; +int crypto_grab_skcipher2(struct crypto_skcipher_spawn *spawn, + const char *name, u32 type, u32 mask) +{ + spawn->base.frontend = &crypto_skcipher_type2; + return crypto_grab_spawn(&spawn->base, name, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_grab_skcipher2); + struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { @@ -243,5 +340,90 @@ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, } EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); +int crypto_has_skcipher2(const char *alg_name, u32 type, u32 mask) +{ + return crypto_type_has_alg(alg_name, &crypto_skcipher_type2, + type, mask); +} +EXPORT_SYMBOL_GPL(crypto_has_skcipher2); + +static int skcipher_prepare_alg(struct skcipher_alg *alg) +{ + struct crypto_alg *base = &alg->base; + + if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8) + return -EINVAL; + + if (!alg->chunksize) + alg->chunksize = base->cra_blocksize; + + base->cra_type = &crypto_skcipher_type2; + base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; + base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER; + + return 0; +} + +int crypto_register_skcipher(struct skcipher_alg *alg) +{ + struct crypto_alg *base = &alg->base; + int err; + + err = skcipher_prepare_alg(alg); + if (err) + return err; + + return crypto_register_alg(base); +} +EXPORT_SYMBOL_GPL(crypto_register_skcipher); + +void crypto_unregister_skcipher(struct skcipher_alg *alg) +{ +
crypto_unregister_alg(&alg->base); +} +EXPORT_SYMBOL_GPL(crypto_unregister_skcipher); + +int crypto_register_skciphers(struct skcipher_alg *algs, int count) +{ + int i, ret; + + for (i = 0; i < count; i++) { + ret = crypto_register_skcipher(&algs[i]); + if (ret) + goto err; + } + + return 0; + +err: + for (--i; i >= 0; --i) + crypto_unregister_skcipher(&algs[i]); + + return ret; +} +EXPORT_SYMBOL_GPL(crypto_register_skciphers); + +void crypto_unregister_skciphers(struct skcipher_alg *algs, int count) +{ + int i; + + for (i = count - 1; i >= 0; --i) + crypto_unregister_skcipher(&algs[i]); +} +EXPORT_SYMBOL_GPL(crypto_unregister_skciphers); + +int skcipher_register_instance(struct crypto_template *tmpl, + struct skcipher_instance *inst) +{ + int err; + + err = skcipher_prepare_alg(&inst->alg); + if (err) + return err; + + return crypto_register_instance(tmpl, skcipher_crypto_instance(inst)); +} +EXPORT_SYMBOL_GPL(skcipher_register_instance); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Symmetric key cipher type"); diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 2cf7a61ece59..ce6619c339fe 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -19,12 +19,46 @@ struct rtattr; +struct skcipher_instance { + void (*free)(struct skcipher_instance *inst); + union { + struct { + char head[offsetof(struct skcipher_alg, base)]; + struct crypto_instance base; + } s; + struct skcipher_alg alg; + }; +}; + struct crypto_skcipher_spawn { struct crypto_spawn base; }; extern const struct crypto_type crypto_givcipher_type; +static inline struct crypto_instance *skcipher_crypto_instance( + struct skcipher_instance *inst) +{ + return &inst->s.base; +} + +static inline struct skcipher_instance *skcipher_alg_instance( + struct crypto_skcipher *skcipher) +{ + return container_of(crypto_skcipher_alg(skcipher), + struct skcipher_instance, alg); +} + +static inline void *skcipher_instance_ctx(struct
skcipher_instance *inst) +{ + return crypto_instance_ctx(skcipher_crypto_instance(inst)); +} + +static inline void skcipher_request_complete(struct skcipher_request *req, int err) +{ + req->base.complete(&req->base, err); +} + static inline void crypto_set_skcipher_spawn( struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst) { @@ -33,6 +67,8 @@ static inline void crypto_set_skcipher_spawn( int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, const char *name, u32 type, u32 mask); +int crypto_grab_skcipher2(struct crypto_skcipher_spawn *spawn, + const char *name, u32 type, u32 mask); struct crypto_alg *crypto_lookup_skcipher(const char *name, u32 type, u32 mask); @@ -47,6 +83,12 @@ static inline struct crypto_alg *crypto_skcipher_spawn_alg( return spawn->base.alg; } +static inline struct skcipher_alg *crypto_spawn_skcipher_alg( + struct crypto_skcipher_spawn *spawn) +{ + return container_of(spawn->base.alg, struct skcipher_alg, base); +} + static inline struct crypto_ablkcipher *crypto_spawn_skcipher( struct crypto_skcipher_spawn *spawn) { @@ -55,6 +97,25 @@ static inline struct crypto_ablkcipher *crypto_spawn_skcipher( crypto_skcipher_mask(0))); } +static inline struct crypto_skcipher *crypto_spawn_skcipher2( + struct crypto_skcipher_spawn *spawn) +{ + return crypto_spawn_tfm2(&spawn->base); +} + +static inline void crypto_skcipher_set_reqsize( + struct crypto_skcipher *skcipher, unsigned int reqsize) +{ + skcipher->reqsize = reqsize; +} + +int crypto_register_skcipher(struct skcipher_alg *alg); +void crypto_unregister_skcipher(struct skcipher_alg *alg); +int crypto_register_skciphers(struct skcipher_alg *algs, int count); +void crypto_unregister_skciphers(struct skcipher_alg *algs, int count); +int skcipher_register_instance(struct crypto_template *tmpl, + struct skcipher_instance *inst); + int skcipher_null_givencrypt(struct skcipher_givcrypt_request *req); int skcipher_null_givdecrypt(struct skcipher_givcrypt_request *req); const char
*crypto_default_geniv(const struct crypto_alg *alg); @@ -122,5 +183,31 @@ static inline u32 skcipher_request_flags(struct skcipher_request *req) return req->base.flags; } +static inline unsigned int crypto_skcipher_alg_min_keysize( + struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blkcipher.min_keysize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_ablkcipher.min_keysize; + + return alg->min_keysize; +} + +static inline unsigned int crypto_skcipher_alg_max_keysize( + struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blkcipher.max_keysize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_ablkcipher.max_keysize; + + return alg->max_keysize; +} + #endif /* _CRYPTO_INTERNAL_SKCIPHER_H */ diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index 41418790c536..5c90d3edf975 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -65,6 +65,75 @@ struct crypto_skcipher { struct crypto_tfm base; }; +/** + * struct skcipher_alg - symmetric key cipher definition + * @min_keysize: Minimum key size supported by the transformation. This is the + * smallest key length supported by this transformation algorithm. + * This must be set to one of the pre-defined values as this is + * not hardware specific. Possible values for this field can be + * found via git grep "_MIN_KEY_SIZE" include/crypto/ + * @max_keysize: Maximum key size supported by the transformation. This is the + * largest key length supported by this transformation algorithm. + * This must be set to one of the pre-defined values as this is + * not hardware specific. Possible values for this field can be + * found via git grep "_MAX_KEY_SIZE" include/crypto/ + * @setkey: Set key for the transformation.
This function is used to either + * program a supplied key into the hardware or store the key in the + * transformation context for programming it later. Note that this + * function does modify the transformation context. This function can + * be called multiple times during the existence of the transformation + * object, so one must make sure the key is properly reprogrammed into + * the hardware. This function is also responsible for checking the key + * length for validity. In case a software fallback was put in place in + * the @cra_init call, this function might need to use the fallback if + * the algorithm doesn't support all of the key sizes. + * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt + * the supplied scatterlist containing the blocks of data. The crypto + * API consumer is responsible for aligning the entries of the + * scatterlist properly and making sure the chunks are correctly + * sized. In case a software fallback was put in place in the + * @cra_init call, this function might need to use the fallback if + * the algorithm doesn't support all of the key sizes. In case the + * key was stored in transformation context, the key might need to be + * re-programmed into the hardware in this function. This function + * shall not modify the transformation context, as this function may + * be called in parallel with the same transformation object. + * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt + * and the conditions are exactly the same. + * @init: Initialize the cryptographic transformation object. This function + * is used to initialize the cryptographic transformation object. + * This function is called only once at the instantiation time, right + * after the transformation context was allocated.
In case the + * cryptographic hardware has some special requirements which need to + * be handled by software, this function shall check for the precise + * requirement of the transformation and put any software fallbacks + * in place. + * @exit: Deinitialize the cryptographic transformation object. This is a + * counterpart to @init, used to remove various changes set in + * @init. + * @ivsize: IV size applicable for transformation. The consumer must provide an + * IV of exactly that size to perform the encrypt or decrypt operation. + * @chunksize: Equal to the block size except for stream ciphers such as + * CTR where it is set to the underlying block size. + * + * All fields except @ivsize are mandatory and must be filled. + */ +struct skcipher_alg { + int (*setkey)(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen); + int (*encrypt)(struct skcipher_request *req); + int (*decrypt)(struct skcipher_request *req); + int (*init)(struct crypto_skcipher *tfm); + void (*exit)(struct crypto_skcipher *tfm); + + unsigned int min_keysize; + unsigned int max_keysize; + unsigned int ivsize; + unsigned int chunksize; + + struct crypto_alg base; +}; + #define SKCIPHER_REQUEST_ON_STACK(name, tfm) \ char __##name##_desc[sizeof(struct skcipher_request) + \ crypto_skcipher_reqsize(tfm)] CRYPTO_MINALIGN_ATTR; \ @@ -231,12 +300,43 @@ static inline int crypto_has_skcipher(const char *alg_name, u32 type, crypto_skcipher_mask(mask)); } +/** + * crypto_has_skcipher2() - Search for the availability of an skcipher.
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the + * skcipher + * @type: specifies the type of the skcipher + * @mask: specifies the mask for the skcipher + * + * Return: true when the skcipher is known to the kernel crypto API; false + * otherwise + */ +int crypto_has_skcipher2(const char *alg_name, u32 type, u32 mask); + static inline const char *crypto_skcipher_driver_name( struct crypto_skcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)); } +static inline struct skcipher_alg *crypto_skcipher_alg( + struct crypto_skcipher *tfm) +{ + return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, + struct skcipher_alg, base); +} + +static inline unsigned int crypto_skcipher_alg_ivsize(struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blkcipher.ivsize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_ablkcipher.ivsize; + + return alg->ivsize; +} + /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle @@ -251,6 +351,36 @@ static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm) return tfm->ivsize; } +static inline unsigned int crypto_skcipher_alg_chunksize( + struct skcipher_alg *alg) +{ + if ((alg->base.cra_flags & CRYPTO_ALG_TYPE_MASK) == + CRYPTO_ALG_TYPE_BLKCIPHER) + return alg->base.cra_blocksize; + + if (alg->base.cra_ablkcipher.encrypt) + return alg->base.cra_blocksize; + + return alg->chunksize; +} + +/** + * crypto_skcipher_chunksize() - obtain chunk size + * @tfm: cipher handle + * + * The block size is set to one for ciphers such as CTR. However, + * you still need to provide incremental updates in multiples of + * the underlying block size as the IV does not have sub-block + * granularity. This is known in this API as the chunk size.
+ * + * Return: chunk size in bytes + */ +static inline unsigned int crypto_skcipher_chunksize( + struct crypto_skcipher *tfm) +{ + return crypto_skcipher_alg_chunksize(crypto_skcipher_alg(tfm)); +} + /** * crypto_skcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b7c1e1a7ebac..d7c8b37b2e95 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -48,6 +48,7 @@ #define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 +#define CRYPTO_ALG_TYPE_SKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006 #define CRYPTO_ALG_TYPE_DIGEST 0x00000008 #define CRYPTO_ALG_TYPE_HASH 0x00000008 From 4cbda579cd3d67e4f2097bd790ffcd28eef40c7b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 23 Jan 2016 13:51:01 +0800 Subject: [PATCH 696/804] crypto: api - Add crypto_type_has_alg helper This patch adds the helper crypto_type_has_alg which is meant to replace crypto_has_alg for new-style crypto types. Rather than hard-coding type/mask information they're now retrieved from the crypto_type object. 
Signed-off-by: Herbert Xu --- crypto/algapi.c | 15 +++++++++++++++ crypto/internal.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/crypto/algapi.c b/crypto/algapi.c index 59bf491fe3d8..c63f06a8b76a 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -988,6 +988,21 @@ unsigned int crypto_alg_extsize(struct crypto_alg *alg) } EXPORT_SYMBOL_GPL(crypto_alg_extsize); +int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, + u32 type, u32 mask) +{ + int ret = 0; + struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask); + + if (!IS_ERR(alg)) { + crypto_mod_put(alg); + ret = 1; + } + + return ret; +} +EXPORT_SYMBOL_GPL(crypto_type_has_alg); + static int __init crypto_algapi_init(void) { crypto_init_proc(); diff --git a/crypto/internal.h b/crypto/internal.h index 00e42a3ed814..7eefcdb00227 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -104,6 +104,9 @@ int crypto_probing_notify(unsigned long val, void *v); unsigned int crypto_alg_extsize(struct crypto_alg *alg); +int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, + u32 type, u32 mask); + static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) { atomic_inc(&alg->cra_refcnt); From e7724207f71e4bb50b1a34e234f22247c721b246 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 18 May 2018 10:58:14 -0700 Subject: [PATCH 697/804] fscrypt: log the crypto algorithm implementations Log the crypto algorithm driver name for each fscrypt encryption mode on its first use, also showing a friendly name for the mode. This will help people determine whether the expected implementations are being used. In some cases we've seen people do benchmarks and reject using encryption for performance reasons, when in fact they used a much slower implementation of AES-XTS than was possible on the hardware. It can make an enormous difference; e.g., AES-XTS on ARM is about 10x faster with the crypto extensions (AES instructions) than without. 
This also makes it more obvious which modes are being used, now that fscrypt supports multiple combinations of modes. Example messages (with default modes, on x86_64): [ 35.492057] fscrypt: AES-256-CTS-CBC using implementation "cts(cbc-aes-aesni)" [ 35.492171] fscrypt: AES-256-XTS using implementation "xts-aes-aesni" Note: algorithms can be dynamically added to the crypto API, which can result in different implementations being used at different times. But this is rare; for most users, showing the first will be good enough. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/keyinfo.c | 102 +++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 34 deletions(-) diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 154cd89c2212..382e828f2f9a 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -147,44 +147,64 @@ static int find_and_derive_key(const struct inode *inode, return err; } -static const struct { +static struct fscrypt_mode { + const char *friendly_name; const char *cipher_str; int keysize; + bool logged_impl_name; } available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", 64 }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", 32 }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", 16 }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", 16 }, - [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { "xts(speck128)", 64 }, - [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { "cts(cbc(speck128))", 32 }, + [FS_ENCRYPTION_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 
16, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { + .friendly_name = "Speck128/256-XTS", + .cipher_str = "xts(speck128)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { + .friendly_name = "Speck128/256-CTS-CBC", + .cipher_str = "cts(cbc(speck128))", + .keysize = 32, + }, }; -static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, - const char **cipher_str_ret, int *keysize_ret) +static struct fscrypt_mode * +select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode) { - u32 mode; - if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { fscrypt_warn(inode->i_sb, "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", inode->i_ino, ci->ci_data_mode, ci->ci_filename_mode); - return -EINVAL; + return ERR_PTR(-EINVAL); } - if (S_ISREG(inode->i_mode)) { - mode = ci->ci_data_mode; - } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - mode = ci->ci_filename_mode; - } else { - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", - inode->i_ino, (inode->i_mode & S_IFMT)); - return -EINVAL; - } + if (S_ISREG(inode->i_mode)) + return &available_modes[ci->ci_data_mode]; - *cipher_str_ret = available_modes[mode].cipher_str; - *keysize_ret = available_modes[mode].keysize; - return 0; + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[ci->ci_filename_mode]; + + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); } static void put_crypt_info(struct fscrypt_info *ci) @@ -269,8 +289,7 @@ int fscrypt_get_encryption_info(struct inode *inode) struct fscrypt_info *crypt_info; struct fscrypt_context ctx; struct crypto_skcipher *ctfm; - const char *cipher_str; - int keysize; + struct fscrypt_mode *mode; u8 *raw_key = NULL; int 
res; @@ -314,40 +333,55 @@ int fscrypt_get_encryption_info(struct inode *inode) memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); - if (res) + mode = select_encryption_mode(crypt_info, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); goto out; + } /* * This cannot be a stack buffer because it is passed to the scatterlist * crypto API as part of key derivation. */ res = -ENOMEM; - raw_key = kmalloc(keysize, GFP_NOFS); + raw_key = kmalloc(mode->keysize, GFP_NOFS); if (!raw_key) goto out; - res = find_and_derive_key(inode, &ctx, raw_key, keysize); + res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize); if (res) goto out; - ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); + ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); if (IS_ERR(ctfm)) { res = PTR_ERR(ctfm); fscrypt_warn(inode->i_sb, "error allocating '%s' transform for inode %lu: %d", - cipher_str, inode->i_ino, res); + mode->cipher_str, inode->i_ino, res); goto out; } + if (unlikely(!mode->logged_impl_name)) { + /* + * fscrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. 
+ */ + mode->logged_impl_name = true; + pr_info("fscrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(ctfm)->base.cra_driver_name); + } crypt_info->ci_ctfm = ctfm; crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, keysize); + res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize); if (res) goto out; if (S_ISREG(inode->i_mode) && crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { - res = init_essiv_generator(crypt_info, raw_key, keysize); + res = init_essiv_generator(crypt_info, raw_key, mode->keysize); if (res) { fscrypt_warn(inode->i_sb, "error initializing ESSIV generator for inode %lu: %d", From 71aaced0e1eea5f15b6aba888d9ded4eb29f8c9b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 3 Apr 2018 15:08:17 +0800 Subject: [PATCH 698/804] f2fs: introduce private inode status mapping Previously, we used the generic FS_*_FL flags defined by the VFS to indicate inode status for each bit of i_flags, so f2fs's flag definitions were tied to the VFS ones, which makes it hard for f2fs to reuse bits it never used to indicate new status. To solve this issue, we introduce a private inode status mapping. Note that bits which have already been persisted to disk must never change their definition; the other bits can be remapped for new status flags later.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++--- fs/f2fs/file.c | 19 +++++++++-------- fs/f2fs/inode.c | 12 +++++------ fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 4 ++-- 5 files changed, 75 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 51e4a9499f04..66c315a8ef78 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2219,9 +2219,60 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) -#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) +/* + * Inode flags + */ +#define F2FS_SECRM_FL 0x00000001 /* Secure deletion */ +#define F2FS_UNRM_FL 0x00000002 /* Undelete */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ +#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ +#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ +#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ +#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... 
*/ +#define F2FS_DIRTY_FL 0x00000100 +#define F2FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define F2FS_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define F2FS_ENCRYPT_FL 0x00000800 /* encrypted file */ +/* End compression flags --- maybe not all used */ +#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define F2FS_IMAGIC_FL 0x00002000 /* AFS directory */ +#define F2FS_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define F2FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define F2FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define F2FS_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ +#define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ + +/* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ +#define F2FS_FL_XFLAG_VISIBLE (F2FS_SYNC_FL | \ + F2FS_IMMUTABLE_FL | \ + F2FS_APPEND_FL | \ + F2FS_NODUMP_FL | \ + F2FS_NOATIME_FL | \ + F2FS_PROJINHERIT_FL) + +/* Flags that should be inherited by new inodes from their parent. */ +#define F2FS_FL_INHERITED (F2FS_SECRM_FL | F2FS_UNRM_FL | F2FS_COMPR_FL |\ + F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL |\ + F2FS_NOCOMPR_FL | F2FS_JOURNAL_DATA_FL |\ + F2FS_NOTAIL_FL | F2FS_DIRSYNC_FL |\ + F2FS_PROJINHERIT_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 40d03d58b390..fc7d07f93bbe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -689,16 +689,16 @@ int f2fs_getattr(struct vfsmount *mnt, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } - flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); - if (flags & FS_APPEND_FL) + flags = fi->i_flags & (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); + if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; - if (flags & FS_COMPR_FL) + if (flags & F2FS_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (f2fs_encrypted_inode(inode)) stat->attributes |= STATX_ATTR_ENCRYPTED; - if (flags & FS_IMMUTABLE_FL) + if (flags & F2FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; - if (flags & FS_NODUMP_FL) + if (flags & F2FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | @@ -1590,7 +1590,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags = fi->i_flags & + (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); return put_user(flags, (int __user *)arg); } @@ -1624,15 +1625,15 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) oldflags = fi->i_flags; - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if ((flags ^ oldflags) & (F2FS_APPEND_FL | F2FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { ret = -EPERM; goto unlock_out; } } - flags = flags & FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; + flags = flags & (F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); + flags |= 
oldflags & ~(F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); fi->i_flags = flags; inode->i_ctime = current_time(inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 51846fc54fbd..2056211379f9 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -36,15 +36,15 @@ void f2fs_set_inode_flags(struct inode *inode) unsigned int flags = F2FS_I(inode)->i_flags; unsigned int new_fl = 0; - if (flags & FS_SYNC_FL) + if (flags & F2FS_SYNC_FL) new_fl |= S_SYNC; - if (flags & FS_APPEND_FL) + if (flags & F2FS_APPEND_FL) new_fl |= S_APPEND; - if (flags & FS_IMMUTABLE_FL) + if (flags & F2FS_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; - if (flags & FS_NOATIME_FL) + if (flags & F2FS_NOATIME_FL) new_fl |= S_NOATIME; - if (flags & FS_DIRSYNC_FL) + if (flags & F2FS_DIRSYNC_FL) new_fl |= S_DIRSYNC; if (f2fs_encrypted_inode(inode)) new_fl |= S_ENCRYPTED; @@ -268,7 +268,7 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; - if (fi->i_flags & FS_PROJINHERIT_FL) + if (fi->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index fecae8685d2a..dd77ecbd536d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -61,7 +61,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } if (f2fs_sb_has_project_quota(sbi->sb) && - (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, @@ -116,9 +116,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); if (S_ISDIR(inode->i_mode)) - F2FS_I(inode)->i_flags |= FS_INDEX_FL; + F2FS_I(inode)->i_flags |= F2FS_INDEX_FL; - if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + if (F2FS_I(inode)->i_flags & 
F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); trace_f2fs_new_inode(inode, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 777ed4eafa6c..b6ce10f8128a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1805,7 +1805,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, inode = d_inode(path->dentry); inode_lock(inode); - F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); @@ -1829,7 +1829,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) goto out_put; inode_lock(inode); - F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); From ec034d0f14ca093cf656843fa097350875c3895d Mon Sep 17 00:00:00 2001 From: Yunlong Song Date: Tue, 3 Apr 2018 19:42:41 +0800 Subject: [PATCH 699/804] f2fs: remove unmatched zero_user_segment when convert inline dentry Since the layout of regular dentry block is different from inline dentry block, zero_user_segment starting from MAX_INLINE_DATA(dir) is not correct for regular dentry block, besides, bitmap is already copied and used, so there is no necessary to zero page at all, so just remove the zero_user_segment is OK. 
Signed-off-by: Yunlong Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2ff0305391cd..85371b0971d9 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -368,7 +368,6 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = page_address(page); From cd79eb2b5e451ca0be15338684252aef56dd319d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 Apr 2018 17:35:13 +0800 Subject: [PATCH 700/804] f2fs: remove redundant block plug For buffered IO, we don't need to use a block plug to cache bios; for direct IO, the generic f2fs_direct_IO path has already added a block plug, so let's remove the redundant one in .write_iter. As Yunlei described in his patch: -f2fs_file_write_iter -blk_start_plug -__generic_file_write_iter ... -do_blockdev_direct_IO -blk_start_plug ... -blk_finish_plug ... -blk_finish_plug which may cause a performance decrease on our platform Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fc7d07f93bbe..b2db8349c97b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2695,7 +2695,6 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct blk_plug plug; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) @@ -2740,9 +2739,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; } } - blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); /* if we couldn't write data, we should deallocate blocks.
*/ From fdf61219dc2512cd29b8b03a460a51af8ddca876 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 4 Apr 2018 17:29:05 +0800 Subject: [PATCH 701/804] f2fs: issue all big range discards in umount process This patch sets max_requests to UINT_MAX in order to issue all big range discards during umount. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a02d5c1a7ed2..3c2e44f76ff3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1024,6 +1024,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; } } From 298032d4d4a6dc6da4f7298da0200ef56e93006d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 9 Apr 2018 20:25:06 +0800 Subject: [PATCH 702/804] f2fs: don't use GFP_ZERO for page caches Related to https://lkml.org/lkml/2018/4/8/661 Sometimes, we need to write meta data to a newly allocated block address; we then allocate a zeroed page in the inner inode's address space, fill partial data into it, and leave the rest of the page zeroed, which means that some fields are left in their initial state. There are two inner inodes (meta inode and node inode) setting __GFP_ZERO. I have just checked them: for both of them, we can avoid using __GFP_ZERO and do the initialization by ourselves, to avoid unneeded/redundant zeroing from mm.
Cc: Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/inode.c | 4 ++-- fs/f2fs/segment.c | 3 +++ fs/f2fs/segment.h | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 760d1ad22722..0bdd5bdfeaf9 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -100,8 +100,10 @@ repeat: * readonly and make sure do not write checkpoint with non-uptodate * meta page. */ - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + memset(page_address(page), 0, PAGE_SIZE); f2fs_stop_checkpoint(sbi, false); + } out: return page; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2056211379f9..8187ef8bab98 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -320,10 +320,10 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3c2e44f76ff3..a1f9c8a19383 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2059,6 +2059,7 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *dst; dst = (struct f2fs_summary_block *)page_address(page); + memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); @@ -3214,6 +3215,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) page = grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ 
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -3238,6 +3240,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) if (!page) { page = grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); + memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 96a2d57ba8a4..e352e01854b0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -375,6 +375,7 @@ static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, int i; raw_sit = (struct f2fs_sit_block *)page_address(page); + memset(raw_sit, 0, PAGE_SIZE); for (i = 0; i < end - start; i++) { rs = &raw_sit->entries[i]; se = get_seg_entry(sbi, start + i); From 3e90db63fcfcac8c406704b165597d2a33de4450 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Apr 2018 11:25:53 +0800 Subject: [PATCH 703/804] f2fs: remove unneeded F2FS_PROJINHERIT_FL Now F2FS_FL_USER_VISIBLE and F2FS_FL_USER_MODIFIABLE has included F2FS_PROJINHERIT_FL, so remove unneeded F2FS_PROJINHERIT_FL when using visible/modifiable flag macro. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b2db8349c97b..5b4802a67eba 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -689,7 +689,7 @@ int f2fs_getattr(struct vfsmount *mnt, stat->btime.tv_nsec = fi->i_crtime.tv_nsec; } - flags = fi->i_flags & (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); + flags = fi->i_flags & F2FS_FL_USER_VISIBLE; if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & F2FS_COMPR_FL) @@ -1632,8 +1632,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) } } - flags = flags & (F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); - flags |= oldflags & ~(F2FS_FL_USER_MODIFIABLE | F2FS_PROJINHERIT_FL); + flags = flags & (F2FS_FL_USER_MODIFIABLE); + flags |= oldflags & ~(F2FS_FL_USER_MODIFIABLE); fi->i_flags = flags; inode->i_ctime = current_time(inode); From 17f85d070886c69dd5bc5f32dc4fcdbd24199a7a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Apr 2018 11:27:14 +0800 Subject: [PATCH 704/804] f2fs: fix to show missing bits in FS_IOC_GETFLAGS This patch fixes FS_IOC_GETFLAGS to show the missing encrypt/inline_data flags, like ext4 does.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5b4802a67eba..06f500177bde 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1590,8 +1590,15 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & - (F2FS_FL_USER_VISIBLE | F2FS_PROJINHERIT_FL); + unsigned int flags = fi->i_flags; + + if (file_is_encrypt(inode)) + flags |= F2FS_ENCRYPT_FL; + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + flags |= F2FS_INLINE_DATA_FL; + + flags &= F2FS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); } From 9190cadf38db9a3b321c8882b1d27219a5e6f436 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 8 Apr 2018 20:39:03 +0800 Subject: [PATCH 705/804] f2fs: correct return value of f2fs_trim_fs Correct return value in two cases: - return EINVAL if end boundary is out-of-range. - return EIO if fs needs off-line check. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a1f9c8a19383..f1fe260537e0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2489,12 +2489,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) return -EINVAL; if (end <= MAIN_BLKADDR(sbi)) - goto out; + return -EINVAL; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_msg(sbi->sb, KERN_WARNING, "Found FS corruption, run fsck to fix."); - goto out; + return -EIO; } /* start/end segment number in main_area */ From ea2813111f1f31e04892c955291356322eec23b8 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Mon, 9 Apr 2018 04:28:41 +0800 Subject: [PATCH 706/804] f2fs: check cur_valid_map_mir & raw_sit block count when flush sit entries We should check valid_map_mir and block count to ensure the flushed raw_sit is correct. Signed-off-by: Zhikang Zhang Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1fe260537e0..7d6c1e4b1374 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3481,6 +3481,11 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) int offset, sit_offset; se = get_seg_entry(sbi, segno); +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, + SIT_VBLOCK_MAP_SIZE)) + f2fs_bug_on(sbi, 1); +#endif /* add discard candidates */ if (!(cpc->reason & CP_DISCARD)) { @@ -3496,10 +3501,14 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) cpu_to_le32(segno); seg_info_to_raw_sit(se, &sit_in_journal(journal, offset)); + check_block_count(sbi, segno, + &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); + check_block_count(sbi, segno, + 
&raw_sit->entries[sit_offset]); } __clear_bit(segno, bitmap); From 0d17eb90b56aafeea4d7053e8eba8dd0cffaee39 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Sat, 14 Apr 2018 01:02:34 +0800 Subject: [PATCH 707/804] f2fs: change le32 to le16 of f2fs_inode->i_extra_size In the structure of f2fs_inode, i_extra_size's type is __le16, so we should keep type consistent when using it. Fixes: 704956ecf5bc ("f2fs: support inode checksum") Signed-off-by: Zhikang Zhang Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8187ef8bab98..b83e0cc49d3d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -117,7 +117,6 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *ri = &F2FS_NODE(page)->i; - int extra_isize = le32_to_cpu(ri->i_extra_isize); if (!f2fs_sb_has_inode_chksum(sbi->sb)) return false; @@ -125,7 +124,8 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; - if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), + i_inode_checksum)) return false; return true; From 9d77ded0a71d5174ce8c4657b8b49a847122b143 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 Apr 2018 17:51:28 +0800 Subject: [PATCH 708/804] f2fs: fix race in between GC and atomic open Thread GC thread - f2fs_ioc_start_atomic_write - get_dirty_pages - filemap_write_and_wait_range - f2fs_gc - do_garbage_collect - gc_data_segment - move_data_page - f2fs_is_atomic_file - set_page_dirty - set_inode_flag(, FI_ATOMIC_FILE) Dirty data page can still be generated by GC in race condition as above call stack. 
This patch adds fi->dio_rwsem[WRITE] in f2fs_ioc_start_atomic_write to avoid such a race. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 06f500177bde..93debcf83d29 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1676,6 +1676,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + if (f2fs_is_atomic_file(inode)) goto out; @@ -1705,6 +1707,7 @@ inc_stat: stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; From aa857e0f3b0993899e39659b2f671e8cc9870ac3 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 17 Apr 2018 17:12:27 +0800 Subject: [PATCH 709/804] f2fs: check if inmem_pages list is empty correctly `cur' will never be NULL, so we should check whether the inmem_pages list is empty instead. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7d6c1e4b1374..a7f0e5932642 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -328,7 +328,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) break; } - f2fs_bug_on(sbi, !cur || cur->page != page); + f2fs_bug_on(sbi, list_empty(head) || cur->page != page); list_del(&cur->list); mutex_unlock(&fi->inmem_lock); From 258489ec52208c6cc9893f3ce2791cc9d9fbb04b Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 18 Apr 2018 11:06:39 +0800 Subject: [PATCH 710/804] f2fs: allocate hot_data for atomic write more strictly If a file whose type is not set as hot has more dirty pages than the threshold (64) before starting an atomic write, it may lose the hot flag.
v1->v2: move set FI_ATOMIC_FILE flag behind flush dirty pages too, in case of dirty pages before starting atomic use atomic mode to write back. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 93debcf83d29..7ccb832aa929 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1685,24 +1685,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - set_inode_flag(inode, FI_ATOMIC_FILE); - set_inode_flag(inode, FI_HOT_DATA); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - if (!get_dirty_pages(inode)) - goto inc_stat; + goto skip_flush; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); + if (ret) goto out; - } +skip_flush: + set_inode_flag(inode, FI_HOT_DATA); + set_inode_flag(inode, FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -inc_stat: F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); From a6d74bb282adbae0319ede6a0de3b6983c3c3b46 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 Apr 2018 17:45:02 +0800 Subject: [PATCH 711/804] f2fs: fix return value in f2fs_ioc_commit_atomic_write In f2fs_ioc_commit_atomic_write, if file is volatile, return -EINVAL to indicate that commit failure. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7ccb832aa929..4334683e5491 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1725,8 +1725,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); - if (f2fs_is_volatile_file(inode)) + if (f2fs_is_volatile_file(inode)) { + ret = -EINVAL; goto err_out; + } if (f2fs_is_atomic_file(inode)) { ret = commit_inmem_pages(inode); From 937f4ef79e257735e03149815fb231c8d02e3a1f Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Fri, 13 Apr 2018 11:08:05 +0800 Subject: [PATCH 712/804] f2fs: stop issue discard if something wrong with f2fs v4->v5: move data corruption check to __submit_discard_cmd, in order to control discard io submitted more accurately, besides, increase async thread wait time if data corruption detected. This patch stop async thread and umount process to issue discard if something wrong with f2fs, which is similar to fstrim. 
Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7f0e5932642..d4b787b00c5a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1044,6 +1044,9 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (dc->state != D_PREP) return; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return; + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); dc->error = __blkdev_issue_discard(dc->bdev, @@ -1475,6 +1478,10 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + wait_ms = dpolicy.max_interval; + continue; + } if (dcc->discard_wake) dcc->discard_wake = 0; From 23d00b02878ee939100ed8aae68b5ac170899bf2 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Sat, 21 Apr 2018 14:12:50 +0800 Subject: [PATCH 713/804] f2fs: remove duplicated dquot_initialize and fix error handling This patch removes duplicated dquot_initialize in recover_orphan_inode(), and fix the error handling if dquot_initialize fails. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0bdd5bdfeaf9..6d331c21f7ce 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -593,10 +593,11 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) } err = dquot_initialize(inode); - if (err) + if (err) { + iput(inode); goto err_out; + } - dquot_initialize(inode); clear_nlink(inode); /* truncate all the data during iput */ From 7aff5c69da4c925dcd7dc01a248a14be7d83d5c6 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Mon, 23 Apr 2018 10:29:13 +0800 Subject: [PATCH 714/804] f2fs: do not check F2FS_INLINE_DOTS in recover Only dir may have F2FS_INLINE_DOTS flag, so there is no need to check the flag in recover flow. 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4ddc2262baf1..7305226a7476 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -204,8 +204,6 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) set_inode_flag(inode, FI_DATA_EXIST); else clear_inode_flag(inode, FI_DATA_EXIST); - if (!(ri->i_inline & F2FS_INLINE_DOTS)) - clear_inode_flag(inode, FI_INLINE_DOTS); } static void recover_inode(struct inode *inode, struct page *page) From b025f6dfc018e49f53549c846c3ad6045aab39cd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 23 Apr 2018 10:36:14 +0800 Subject: [PATCH 715/804] f2fs: clean up commit_inmem_pages() This patch moves error handling from commit_inmem_pages() into __commit_inmem_page() for cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 54 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d4b787b00c5a..7f6f029aa866 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -343,8 +343,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __commit_inmem_pages(struct inode *inode, - struct list_head *revoke_list) +static int __commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -357,9 +356,12 @@ static int __commit_inmem_pages(struct inode *inode, .op_flags = REQ_SYNC | REQ_PRIO, .io_type = FS_DATA_IO, }; + struct list_head revoke_list; pgoff_t last_idx = ULONG_MAX; int err = 0; + INIT_LIST_HEAD(&revoke_list); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { struct page *page = cur->page; @@ -393,35 +395,13 @@ retry: last_idx = page->index; } 
unlock_page(page); - list_move_tail(&cur->list, revoke_list); + list_move_tail(&cur->list, &revoke_list); } if (last_idx != ULONG_MAX) f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); - if (!err) - __revoke_inmem_pages(inode, revoke_list, false, false); - - return err; -} - -int commit_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct list_head revoke_list; - int err; - - INIT_LIST_HEAD(&revoke_list); - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - - set_inode_flag(inode, FI_ATOMIC_COMMIT); - - mutex_lock(&fi->inmem_lock); - err = __commit_inmem_pages(inode, &revoke_list); if (err) { - int ret; /* * try to revoke all committed pages, but still we could fail * due to no memory or other reason, if that happened, EAGAIN @@ -430,13 +410,31 @@ int commit_inmem_pages(struct inode *inode) * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. 
*/ - ret = __revoke_inmem_pages(inode, &revoke_list, false, true); - if (ret) - err = ret; + err = __revoke_inmem_pages(inode, &revoke_list, false, true); /* drop all uncommitted pages */ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + } else { + __revoke_inmem_pages(inode, &revoke_list, false, false); } + + return err; +} + +int commit_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int err; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + set_inode_flag(inode, FI_ATOMIC_COMMIT); + + mutex_lock(&fi->inmem_lock); + err = __commit_inmem_pages(inode); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) list_del_init(&fi->inmem_ilist); From 1a5d1966c0ca8c09e94d41f4490d7d7a53dd5cb2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 23 Apr 2018 23:02:31 -0600 Subject: [PATCH 716/804] f2fs: give message and set need_fsck given broken node id syzbot hit the following crash on upstream commit 83beed7b2b26f232d782127792dd0cd4362fdc41 (Fri Apr 20 17:56:32 2018 +0000) Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=d154ec99402c6f628887 C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5414336294027264 syzkaller reproducer: https://syzkaller.appspot.com/x/repro.syz?id=5471683234234368 Raw console output: https://syzkaller.appspot.com/x/log.txt?id=5436660795834368 Kernel config: https://syzkaller.appspot.com/x/.config?id=1808800213120130118 compiler: gcc (GCC) 8.0.1 20180413 (experimental) IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+d154ec99402c6f628887@syzkaller.appspotmail.com It will help syzbot understand when the bug is fixed. See footer for details. If you forward the report, please keep this part and the footer. 
F2FS-fs (loop0): Magic Mismatch, valid(0xf2f52010) - read(0x0) F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (loop0): invalid crc value ------------[ cut here ]------------ kernel BUG at fs/f2fs/node.c:1185! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 4549 Comm: syzkaller704305 Not tainted 4.17.0-rc1+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__get_node_page+0xb68/0x16e0 fs/f2fs/node.c:1185 RSP: 0018:ffff8801d960e820 EFLAGS: 00010293 RAX: ffff8801d88205c0 RBX: 0000000000000003 RCX: ffffffff82f6cc06 RDX: 0000000000000000 RSI: ffffffff82f6d5e8 RDI: 0000000000000004 RBP: ffff8801d960ec30 R08: ffff8801d88205c0 R09: ffffed003b5e46c2 R10: 0000000000000003 R11: 0000000000000003 R12: ffff8801a86e00c0 R13: 0000000000000001 R14: ffff8801a86e0530 R15: ffff8801d9745240 FS: 000000000072c880(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3d403209b8 CR3: 00000001d8f3f000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: get_node_page fs/f2fs/node.c:1237 [inline] truncate_xattr_node+0x152/0x2e0 fs/f2fs/node.c:1014 remove_inode_page+0x200/0xaf0 fs/f2fs/node.c:1039 f2fs_evict_inode+0xe86/0x1710 fs/f2fs/inode.c:547 evict+0x4a6/0x960 fs/inode.c:557 iput_final fs/inode.c:1519 [inline] iput+0x62d/0xa80 fs/inode.c:1545 f2fs_fill_super+0x5f4e/0x7bf0 fs/f2fs/super.c:2849 mount_bdev+0x30c/0x3e0 fs/super.c:1164 f2fs_mount+0x34/0x40 fs/f2fs/super.c:3020 mount_fs+0xae/0x328 fs/super.c:1267 vfs_kern_mount.part.34+0xd4/0x4d0 fs/namespace.c:1037 vfs_kern_mount fs/namespace.c:1027 [inline] do_new_mount fs/namespace.c:2518 [inline] do_mount+0x564/0x3070 fs/namespace.c:2848 ksys_mount+0x12d/0x140 fs/namespace.c:3064 __do_sys_mount fs/namespace.c:3078 [inline] 
__se_sys_mount fs/namespace.c:3075 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3075 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x443dea RSP: 002b:00007ffcc7882368 EFLAGS: 00000297 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000020000c00 RCX: 0000000000443dea RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffcc7882370 RBP: 0000000000000003 R08: 0000000020016a00 R09: 000000000000000a R10: 0000000000000000 R11: 0000000000000297 R12: 0000000000000004 R13: 0000000000402ce0 R14: 0000000000000000 R15: 0000000000000000 RIP: __get_node_page+0xb68/0x16e0 fs/f2fs/node.c:1185 RSP: ffff8801d960e820 ---[ end trace 4edbeb71f002bb76 ]--- Reported-and-tested-by: syzbot+d154ec99402c6f628887@syzkaller.appspotmail.com Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 13 +------------ fs/f2fs/inode.c | 13 ++++++------- fs/f2fs/node.c | 21 +++++++++++++++++++-- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 66c315a8ef78..527999edc2a9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1649,18 +1649,6 @@ static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } -/* - * Check whether the given nid is within node id range. 
- */ -static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) -{ - if (unlikely(nid < F2FS_ROOT_INO(sbi))) - return -EINVAL; - if (unlikely(nid >= NM_I(sbi)->max_nid)) - return -EINVAL; - return 0; -} - /* * Check whether the inode has blocks or not */ @@ -2854,6 +2842,7 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, struct dnode_of_data; struct node_info; +int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool available_free_memory(struct f2fs_sb_info *sbi, int type); int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b83e0cc49d3d..ff99110194ef 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -194,12 +194,8 @@ static int do_read_inode(struct inode *inode) projid_t i_projid; /* Check if ino is within scope */ - if (check_nid_range(sbi, inode->i_ino)) { - f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", - (unsigned long) inode->i_ino); - WARN_ON(1); + if (check_nid_range(sbi, inode->i_ino)) return -EINVAL; - } node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) @@ -588,8 +584,11 @@ no_delete: alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); } else { - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); + /* + * If xattr nid is corrupted, we can reach out error condition, + * err & !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, check_nid_range() is enough to give a clue. + */ } out_clear: fscrypt_put_encryption_info(inode, NULL); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 16aee2a7b8a9..7c3e8190cff2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -29,6 +29,21 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; +/* + * Check whether the given nid is within node id range. 
+ */ +int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +{ + if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: out-of-range nid=%x, run fsck to fix.", + __func__, nid); + return -EINVAL; + } + return 0; +} + bool available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1158,7 +1173,8 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + if (check_nid_range(sbi, nid)) + return; rcu_read_lock(); apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); @@ -1182,7 +1198,8 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (!nid) return ERR_PTR(-ENOENT); - f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + if (check_nid_range(sbi, nid)) + return ERR_PTR(-EINVAL); repeat: page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) From ab758ada220fe5c9f1419bcd6c8fb249a3bd1dd4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 11:37:18 -0600 Subject: [PATCH 717/804] f2fs: avoid bug_on on corrupted inode syzbot has tested the proposed patch but the reproducer still triggered crash: kernel BUG at fs/f2fs/inode.c:LINE! F2FS-fs (loop1): invalid crc value F2FS-fs (loop5): Magic Mismatch, valid(0xf2f52010) - read(0x0) F2FS-fs (loop5): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (loop5): invalid crc value ------------[ cut here ]------------ kernel BUG at fs/f2fs/inode.c:238! 
invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 4886 Comm: syz-executor1 Not tainted 4.17.0-rc1+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:do_read_inode fs/f2fs/inode.c:238 [inline] RIP: 0010:f2fs_iget+0x3307/0x3ca0 fs/f2fs/inode.c:313 RSP: 0018:ffff8801c44a70e8 EFLAGS: 00010293 RAX: ffff8801ce208040 RBX: ffff8801b3621080 RCX: ffffffff82eace18 F2FS-fs (loop2): Magic Mismatch, valid(0xf2f52010) - read(0x0) RDX: 0000000000000000 RSI: ffffffff82eaf047 RDI: 0000000000000007 RBP: ffff8801c44a7410 R08: ffff8801ce208040 R09: ffffed0039ee4176 R10: ffffed0039ee4176 R11: ffff8801cf720bb7 R12: ffff8801c0efa000 R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f753aa9d700(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 ------------[ cut here ]------------ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel BUG at fs/f2fs/inode.c:238! CR2: 0000000001b03018 CR3: 00000001c8b74000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: f2fs_fill_super+0x4377/0x7bf0 fs/f2fs/super.c:2842 mount_bdev+0x30c/0x3e0 fs/super.c:1165 f2fs_mount+0x34/0x40 fs/f2fs/super.c:3020 mount_fs+0xae/0x328 fs/super.c:1268 vfs_kern_mount.part.34+0xd4/0x4d0 fs/namespace.c:1037 vfs_kern_mount fs/namespace.c:1027 [inline] do_new_mount fs/namespace.c:2517 [inline] do_mount+0x564/0x3070 fs/namespace.c:2847 ksys_mount+0x12d/0x140 fs/namespace.c:3063 __do_sys_mount fs/namespace.c:3077 [inline] __se_sys_mount fs/namespace.c:3074 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3074 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457daa RSP: 002b:00007f753aa9cba8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 0000000000457daa RDX: 
0000000020000000 RSI: 0000000020000100 RDI: 00007f753aa9cbf0 RBP: 0000000000000064 R08: 0000000020016a00 R09: 0000000020000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 R13: 0000000000000064 R14: 00000000006fcb80 R15: 0000000000000000 RIP: do_read_inode fs/f2fs/inode.c:238 [inline] RSP: ffff8801c44a70e8 RIP: f2fs_iget+0x3307/0x3ca0 fs/f2fs/inode.c:313 RSP: ffff8801c44a70e8 invalid opcode: 0000 [#2] SMP KASAN ---[ end trace 1cbcbec2156680bc ]--- Reported-and-tested-by: syzbot+41a1b341571f0952badb@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ff99110194ef..bface995617b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -185,6 +185,21 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } +static bool sanity_check_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) + && !f2fs_has_extra_attr(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode ino=%lx, run fsck to fix.", + __func__, inode->i_ino); + return false; + } + return true; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -235,7 +250,6 @@ static int do_read_inode(struct inode *inode) le16_to_cpu(ri->i_extra_isize) : 0; if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { - f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -313,6 +327,10 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = do_read_inode(inode); if (ret) goto bad_inode; + if (!sanity_check_inode(inode)) { + ret = -EINVAL; + goto 
bad_inode; + } make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; From 78f8b0f46fa23f9dc5c8b501db414e9546ba44b0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 15:44:16 -0600 Subject: [PATCH 718/804] f2fs: sanity check on sit entry syzbot hit the following crash on upstream commit 87ef12027b9b1dd0e0b12cf311fbcb19f9d92539 (Wed Apr 18 19:48:17 2018 +0000) Merge tag 'ceph-for-4.17-rc2' of git://github.com/ceph/ceph-client syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=83699adeb2d13579c31e C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5805208181407744 syzkaller reproducer: https://syzkaller.appspot.com/x/repro.syz?id=6005073343676416 Raw console output: https://syzkaller.appspot.com/x/log.txt?id=6555047731134464 Kernel config: https://syzkaller.appspot.com/x/.config?id=1808800213120130118 compiler: gcc (GCC) 8.0.1 20180413 (experimental) IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+83699adeb2d13579c31e@syzkaller.appspotmail.com It will help syzbot understand when the bug is fixed. See footer for details. If you forward the report, please keep this part and the footer. 
F2FS-fs (loop0): Magic Mismatch, valid(0xf2f52010) - read(0x0) F2FS-fs (loop0): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (loop0): invalid crc value BUG: unable to handle kernel paging request at ffffed006b2a50c0 PGD 21ffee067 P4D 21ffee067 PUD 21fbeb067 PMD 0 Oops: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 4514 Comm: syzkaller989480 Not tainted 4.17.0-rc1+ #8 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:build_sit_entries fs/f2fs/segment.c:3653 [inline] RIP: 0010:build_segment_manager+0x7ef7/0xbf70 fs/f2fs/segment.c:3852 RSP: 0018:ffff8801b102e5b0 EFLAGS: 00010a06 RAX: 1ffff1006b2a50c0 RBX: 0000000000000004 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8801ac74243e RBP: ffff8801b102f410 R08: ffff8801acbd46c0 R09: fffffbfff14d9af8 R10: fffffbfff14d9af8 R11: ffff8801acbd46c0 R12: ffff8801ac742a80 R13: ffff8801d9519100 R14: dffffc0000000000 R15: ffff880359528600 FS: 0000000001e04880(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffed006b2a50c0 CR3: 00000001ac6ac000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: f2fs_fill_super+0x4095/0x7bf0 fs/f2fs/super.c:2803 mount_bdev+0x30c/0x3e0 fs/super.c:1165 f2fs_mount+0x34/0x40 fs/f2fs/super.c:3020 mount_fs+0xae/0x328 fs/super.c:1268 vfs_kern_mount.part.34+0xd4/0x4d0 fs/namespace.c:1037 vfs_kern_mount fs/namespace.c:1027 [inline] do_new_mount fs/namespace.c:2517 [inline] do_mount+0x564/0x3070 fs/namespace.c:2847 ksys_mount+0x12d/0x140 fs/namespace.c:3063 __do_sys_mount fs/namespace.c:3077 [inline] __se_sys_mount fs/namespace.c:3074 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3074 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 
0033:0x443d6a RSP: 002b:00007ffd312813c8 EFLAGS: 00000297 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000020000c00 RCX: 0000000000443d6a RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffd312813d0 RBP: 0000000000000003 R08: 0000000020016a00 R09: 000000000000000a R10: 0000000000000000 R11: 0000000000000297 R12: 0000000000000004 R13: 0000000000402c60 R14: 0000000000000000 R15: 0000000000000000 RIP: build_sit_entries fs/f2fs/segment.c:3653 [inline] RSP: ffff8801b102e5b0 RIP: build_segment_manager+0x7ef7/0xbf70 fs/f2fs/segment.c:3852 RSP: ffff8801b102e5b0 CR2: ffffed006b2a50c0 ---[ end trace a2034989e196ff17 ]--- Reported-and-tested-by: syzbot+83699adeb2d13579c31e@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7f6f029aa866..ae3cf8dce38e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3762,6 +3762,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int old_valid_blocks; start = le32_to_cpu(segno_in_journal(journal, i)); + if (start >= MAIN_SEGS(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong journal entry on segno %u", + start); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + break; + } + se = &sit_i->sentries[start]; sit = sit_in_journal(journal, i); From 26bf4e8a96aada18cf1b23a920f1f3ee50b5b739 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 21:34:05 -0600 Subject: [PATCH 719/804] f2fs: sanity check for total valid node blocks This patch enhances sanity check for SIT entries. 
syzbot hit the following crash on upstream commit 83beed7b2b26f232d782127792dd0cd4362fdc41 (Fri Apr 20 17:56:32 2018 +0000) Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/evalenti/linux-soc-thermal syzbot dashboard link: https://syzkaller.appspot.com/bug?extid=bf9253040425feb155ad syzkaller reproducer: https://syzkaller.appspot.com/x/repro.syz?id=5692130282438656 Raw console output: https://syzkaller.appspot.com/x/log.txt?id=5095924598571008 Kernel config: https://syzkaller.appspot.com/x/.config?id=1808800213120130118 compiler: gcc (GCC) 8.0.1 20180413 (experimental) IMPORTANT: if you fix the bug, please add the following tag to the commit: Reported-by: syzbot+bf9253040425feb155ad@syzkaller.appspotmail.com It will help syzbot understand when the bug is fixed. See footer for details. If you forward the report, please keep this part and the footer. F2FS-fs (loop0): invalid crc value F2FS-fs (loop0): Try to recover 1th superblock, ret: 0 F2FS-fs (loop0): Mounted with checkpoint version = d F2FS-fs (loop0): Bitmap was wrongly cleared, blk:9740 ------------[ cut here ]------------ kernel BUG at fs/f2fs/segment.c:1884! 
invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 4508 Comm: syz-executor0 Not tainted 4.17.0-rc1+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:update_sit_entry+0x1215/0x1590 fs/f2fs/segment.c:1882 RSP: 0018:ffff8801af526708 EFLAGS: 00010282 RAX: ffffed0035ea4cc0 RBX: ffff8801ad454f90 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff82eeb87e RDI: ffffed0035ea4cb6 RBP: ffff8801af526760 R08: ffff8801ad4a2480 R09: ffffed003b5e4f90 R10: ffffed003b5e4f90 R11: ffff8801daf27c87 R12: ffff8801adb8d380 R13: 0000000000000001 R14: 0000000000000008 R15: 00000000ffffffff FS: 00000000014af940(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f06bc223000 CR3: 00000001adb02000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: allocate_data_block+0x66f/0x2050 fs/f2fs/segment.c:2663 do_write_page+0x105/0x1b0 fs/f2fs/segment.c:2727 write_node_page+0x129/0x350 fs/f2fs/segment.c:2770 __write_node_page+0x7da/0x1370 fs/f2fs/node.c:1398 sync_node_pages+0x18cf/0x1eb0 fs/f2fs/node.c:1652 block_operations+0x429/0xa60 fs/f2fs/checkpoint.c:1088 write_checkpoint+0x3ba/0x5380 fs/f2fs/checkpoint.c:1405 f2fs_sync_fs+0x2fb/0x6a0 fs/f2fs/super.c:1077 __sync_filesystem fs/sync.c:39 [inline] sync_filesystem+0x265/0x310 fs/sync.c:67 generic_shutdown_super+0xd7/0x520 fs/super.c:429 kill_block_super+0xa4/0x100 fs/super.c:1191 kill_f2fs_super+0x9f/0xd0 fs/f2fs/super.c:3030 deactivate_locked_super+0x97/0x100 fs/super.c:316 deactivate_super+0x188/0x1b0 fs/super.c:347 cleanup_mnt+0xbf/0x160 fs/namespace.c:1174 __cleanup_mnt+0x16/0x20 fs/namespace.c:1181 task_work_run+0x1e4/0x290 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:191 [inline] exit_to_usermode_loop+0x2bd/0x310 
arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457d97 RSP: 002b:00007ffd46f9c8e8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000457d97 RDX: 00000000014b09a3 RSI: 0000000000000002 RDI: 00007ffd46f9da50 RBP: 00007ffd46f9da50 R08: 0000000000000000 R09: 0000000000000009 R10: 0000000000000005 R11: 0000000000000246 R12: 00000000014b0940 R13: 0000000000000000 R14: 0000000000000002 R15: 000000000000658e RIP: update_sit_entry+0x1215/0x1590 fs/f2fs/segment.c:1882 RSP: ffff8801af526708 ---[ end trace f498328bb02610a2 ]--- Reported-and-tested-by: syzbot+bf9253040425feb155ad@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+7d6d31d3bc702f566ce3@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+0a725420475916460f12@syzkaller.appspotmail.com Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ae3cf8dce38e..29a648e01415 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3712,6 +3712,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; + block_t total_node_blocks = 0; do { readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, @@ -3734,6 +3735,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; /* build discard map only one time */ if (f2fs_discard_en(sbi)) { @@ -3775,11 +3778,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; + if (IS_NODESEG(se->type)) + total_node_blocks -= 
old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); + if (IS_NODESEG(se->type)) + total_node_blocks += se->valid_blocks; if (f2fs_discard_en(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -3798,6 +3805,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) se->valid_blocks - old_valid_blocks; } up_read(&curseg->journal_rwsem); + + if (!err && total_node_blocks != valid_node_count(sbi)) { + f2fs_msg(sbi->sb, KERN_ERR, + "SIT is corrupted node# %u vs %u", + total_node_blocks, valid_node_count(sbi)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + } + return err; } From cb38cc4e1d02dcad7bbaad1bd7d5e1dc2ac2b78b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 24 Apr 2018 22:43:01 -0600 Subject: [PATCH 720/804] f2fs: enforce fsync_mode=strict for renamed directory This is to give a option for user to be able to recover B/foo in the below case. mkdir A sync() rename(A, B) creat (B/foo) fsync (B/foo) ---crash--- Sugessted-by: Velayudhan Pillai Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index dd77ecbd536d..e6ddc9be1e60 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -970,8 +970,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); f2fs_i_links_write(old_dir, false); } - if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + if (S_ISDIR(old_inode->i_mode)) + add_ino_entry(sbi, old_inode->i_ino, TRANS_DIR_INO); + } f2fs_unlock_op(sbi); From 8bb9a8da75d1678f5c4fc9ec5ea8702960102221 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 24 Apr 2018 11:40:19 +0800 Subject: [PATCH 721/804] f2fs: fix missing clear FI_NO_PREALLOC in some error case This patch fix missing clear FI_NO_PREALLOC in some error case 
Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4334683e5491..e1808ed8fc3e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2732,6 +2732,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_count(from)) || f2fs_has_inline_data(inode) || f2fs_force_buffered_io(inode, WRITE)) { + clear_inode_flag(inode, + FI_NO_PREALLOC); inode_unlock(inode); return -EAGAIN; } From bb015824532c007d9bfbfea91f731d4e02c36320 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 24 Apr 2018 11:40:30 +0800 Subject: [PATCH 722/804] f2fs: move mnt_want_write_file after range check This patch moves mnt_want_write_file after the range check; it is needless to hold write access just to check the arguments. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e1808ed8fc3e..86aa14819637 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2060,15 +2060,15 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { + return -EINVAL; + } + ret = mnt_want_write_file(filp); if (ret) return ret; - end = range.start + range.len; - if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) { - ret = -EINVAL; - goto out; - } do_more: if (!range.sync) { if (!mutex_trylock(&sbi->gc_mutex)) { From f46eddc4da48ec1b8fbee2a1f53356bfa67eec3d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Apr 2018 10:55:28 +0800 Subject: [PATCH 723/804] f2fs: rename dio_rwsem to i_gc_rwsem RW semaphore dio_rwsem in struct f2fs_inode_info was introduced to avoid races between dio and data gc, but now it is more widely used to avoid races between foreground operations and data gc.
So rename it to i_gc_rwsem to improve its readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/file.c | 28 ++++++++++++++-------------- fs/f2fs/gc.c | 14 +++++++------- fs/f2fs/super.c | 4 ++-- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b675d5dd5c91..4436aba07617 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2403,17 +2403,17 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - if (!down_read_trylock(&F2FS_I(inode)->dio_rwsem[rw])) { + if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) { if (iocb->ki_flags & IOCB_NOWAIT) { iocb->ki_hint = hint; err = -EAGAIN; goto out; } - down_read(&F2FS_I(inode)->dio_rwsem[rw]); + down_read(&F2FS_I(inode)->i_gc_rwsem[rw]); } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); - up_read(&F2FS_I(inode)->dio_rwsem[rw]); + up_read(&F2FS_I(inode)->i_gc_rwsem[rw]); if (rw == WRITE) { if (whint_mode == WHINT_MODE_OFF) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 527999edc2a9..0408c9eafa3a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -717,7 +717,9 @@ struct f2fs_inode_info { struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ - struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + + /* avoid racing between foreground op and gc */ + struct rw_semaphore i_gc_rwsem[2]; struct rw_semaphore i_mmap_sem; struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 86aa14819637..79bf6ac9b568 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1193,7 +1193,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_end = 
(offset + len) >> PAGE_SHIFT; /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ @@ -1219,7 +1219,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out_unlock: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1395,7 +1395,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); @@ -1436,7 +1436,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_i_size_write(inode, new_size); out: up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1676,7 +1676,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (f2fs_is_atomic_file(inode)) goto out; @@ -1703,7 +1703,7 @@ skip_flush: stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1723,7 +1723,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1745,7 +1745,7 @@ static int 
f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2332,12 +2332,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->dio_rwsem[WRITE]); + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { inode_unlock(dst); goto out; } @@ -2399,11 +2399,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_unlock_op(sbi); out_unlock: if (src != dst) { - up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); + up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); inode_unlock(dst); } out: - up_write(&F2FS_I(src)->dio_rwsem[WRITE]); + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } @@ -2625,9 +2625,9 @@ int f2fs_precache_extents(struct inode *inode) while (map.m_lblk < end) { map.m_len = end - map.m_lblk; - down_write(&fi->dio_rwsem[WRITE]); + down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE); - up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->i_gc_rwsem[WRITE]); if (err) return err; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d28d31cbd7d2..96b151546279 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -858,7 +858,7 @@ next_step: } if (!down_write_trylock( - &F2FS_I(inode)->dio_rwsem[WRITE])) { + &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); continue; } @@ -867,7 +867,7 @@ next_step: data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); - up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -885,11 +885,11 @@ next_step: 
bool locked = false; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->dio_rwsem[READ])) + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) continue; if (!down_write_trylock( - &fi->dio_rwsem[WRITE])) { - up_write(&fi->dio_rwsem[READ]); + &fi->i_gc_rwsem[WRITE])) { + up_write(&fi->i_gc_rwsem[READ]); continue; } locked = true; @@ -907,8 +907,8 @@ next_step: segno, off); if (locked) { - up_write(&fi->dio_rwsem[WRITE]); - up_write(&fi->dio_rwsem[READ]); + up_write(&fi->i_gc_rwsem[WRITE]); + up_write(&fi->i_gc_rwsem[READ]); } stat_inc_data_blk_count(sbi, 1, gc_type); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b6ce10f8128a..e83691880914 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -837,8 +837,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - init_rwsem(&fi->dio_rwsem[READ]); - init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_gc_rwsem[READ]); + init_rwsem(&fi->i_gc_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); From 03279ce90b4666931c32cebf089c49a223db0c09 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Apr 2018 19:38:17 +0800 Subject: [PATCH 724/804] f2fs: fix potential overflow In build_sit_entries(), if valid_blocks in SIT block is smaller than valid_blocks in journal, for below calculation: sbi->discard_blks += old_valid_blocks - se->valid_blocks; There are two potential overflows: - old_valid_blocks - se->valid_blocks will overflow, and be a very large number. - sbi->discard_blks += result will overflow again, and only accidentally comes out with a correct result. Anyway, it should be fixed.
Fixes: d600af236da5 ("f2fs: avoid unneeded loop in build_sit_entries") Fixes: 1f43e2ad7bff ("f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 29a648e01415..b6a420d65f4e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3795,14 +3795,17 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } else { memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; } } - if (sbi->segs_per_sec > 1) + if (sbi->segs_per_sec > 1) { get_sec_entry(sbi, start)->valid_blocks += - se->valid_blocks - old_valid_blocks; + se->valid_blocks; + get_sec_entry(sbi, start)->valid_blocks -= + old_valid_blocks; + } } up_read(&curseg->journal_rwsem); From 2cf64590361ec367f3d2b91ab29777eb087222bb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 Apr 2018 17:38:29 +0800 Subject: [PATCH 725/804] f2fs: introduce release_discard_addr() for cleanup Introduce release_discard_addr() to include common codes for cleanup. 
Signed-off-by: Chao Yu [Fengguang Wu: declare static function, reported by kbuild test robot] Signed-off-by: Fengguang Wu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b6a420d65f4e..aa5da6ea4ff8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1661,16 +1661,20 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, return false; } +static void release_discard_addr(struct discard_entry *entry) +{ + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); +} + void release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ - list_for_each_entry_safe(entry, this, head, list) { - list_del(&entry->list); - kmem_cache_free(discard_entry_slab, entry); - } + list_for_each_entry_safe(entry, this, head, list) + release_discard_addr(entry); } /* @@ -1770,9 +1774,8 @@ skip: if (cur_pos < sbi->blocks_per_seg) goto find_next; - list_del(&entry->list); + release_discard_addr(entry); dcc->nr_discards -= total_len; - kmem_cache_free(discard_entry_slab, entry); } wake_up_discard_thread(sbi, false); From 9bb86b63dc0f16877a3014611bce29921c1b2ffa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Apr 2018 17:05:50 +0800 Subject: [PATCH 726/804] f2fs: treat volatile file's data as hot one Volatile file's data will be updated often, so it is better to place its data into the hot data segment. In addition, for atomic files, we change to check FI_ATOMIC_FILE instead of FI_HOT_DATA to improve code readability.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 -- fs/f2fs/segment.c | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 79bf6ac9b568..ae21400d3ad5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1695,7 +1695,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; skip_flush: - set_inode_flag(inode, FI_HOT_DATA); set_inode_flag(inode, FI_ATOMIC_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -1738,7 +1737,6 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aa5da6ea4ff8..917d7acb12cf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -309,7 +309,6 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } @@ -2693,7 +2692,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || - is_inode_flag_set(inode, FI_HOT_DATA)) + is_inode_flag_set(inode, FI_HOT_DATA) || + is_inode_flag_set(inode, FI_ATOMIC_FILE) || + is_inode_flag_set(inode, FI_VOLATILE_FILE)) return CURSEG_HOT_DATA; /* rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; From 2bba5b8eb867e9f8ab9b00ebfae3a2a833b4c9c0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 27 Apr 2018 19:03:22 -0700 Subject: [PATCH 727/804] f2fs: enhance sanity_check_raw_super() to avoid potential overflows In order to avoid the below overflow issue, we should have checked the boundaries in superblock before reaching out to allocation. 
As Linus suggested, the right place should be sanity_check_raw_super(). Dr Silvio Cesare of InfoSect reported: There are integer overflows with using the cp_payload superblock field in the f2fs filesystem potentially leading to memory corruption. include/linux/f2fs_fs.h struct f2fs_super_block { ... __le32 cp_payload; fs/f2fs/f2fs.h typedef u32 block_t; /* * should not change u32, since it is the on-disk block * address format, __le32. */ ... static inline block_t __cp_payload(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); } fs/f2fs/checkpoint.c block_t start_blk, orphan_blocks, i, j; ... start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); +++ integer overflows ... unsigned int cp_blks = 1 + __cp_payload(sbi); ... sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); +++ integer overflow leading to incorrect heap allocation. int cp_payload_blks = __cp_payload(sbi); ... ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); +++ sign bug and integer overflow ... for (i = 1; i < 1 + cp_payload_blks; i++) +++ integer overflow ... 
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - NR_CURSEG_TYPE - __cp_payload(sbi)) * F2FS_ORPHANS_PER_BLOCK; +++ integer overflow Reported-by: Greg KH Reported-by: Silvio Cesare Suggested-by: Linus Torvalds Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 75 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e83691880914..ee74aa4a5f84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2137,6 +2137,8 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { + block_t segment_count, segs_per_sec, secs_per_zone; + block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); struct super_block *sb = sbi->sb; @@ -2193,6 +2195,72 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + segment_count = le32_to_cpu(raw_super->segment_count); + segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + total_sections = le32_to_cpu(raw_super->section_count); + + /* blocks_per_seg should be 512, given the above check */ + blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment_count > F2FS_MAX_SEGMENT || + segment_count < F2FS_MIN_SEGMENTS) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + segment_count); + return 1; + } + + if (total_sections > segment_count || + total_sections < F2FS_MIN_SEGMENTS || + segs_per_sec > segment_count || !segs_per_sec) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment/section count (%u, %u x %u)", + segment_count, total_sections, segs_per_sec); + return 1; + } + + if ((segment_count / segs_per_sec) < total_sections) { + f2fs_msg(sb, KERN_INFO, + "Small segment_count (%u < %u * %u)", + segment_count, segs_per_sec, 
total_sections); + return 1; + } + + if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { + f2fs_msg(sb, KERN_INFO, + "Wrong segment_count / block_count (%u > %u)", + segment_count, le32_to_cpu(raw_super->block_count)); + return 1; + } + + if (secs_per_zone > total_sections) { + f2fs_msg(sb, KERN_INFO, + "Wrong secs_per_zone (%u > %u)", + secs_per_zone, total_sections); + return 1; + } + if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION || + raw_super->hot_ext_count > F2FS_MAX_EXTENSION || + (le32_to_cpu(raw_super->extension_count) + + raw_super->hot_ext_count) > F2FS_MAX_EXTENSION) { + f2fs_msg(sb, KERN_INFO, + "Corrupted extension count (%u + %u > %u)", + le32_to_cpu(raw_super->extension_count), + raw_super->hot_ext_count, + F2FS_MAX_EXTENSION); + return 1; + } + + if (le32_to_cpu(raw_super->cp_payload) > + (blocks_per_seg - F2FS_CP_PACKS)) { + f2fs_msg(sb, KERN_INFO, + "Insane cp_payload (%u > %u)", + le32_to_cpu(raw_super->cp_payload), + blocks_per_seg - F2FS_CP_PACKS); + return 1; + } + /* check reserved ino info */ if (le32_to_cpu(raw_super->node_ino) != 1 || le32_to_cpu(raw_super->meta_ino) != 2 || @@ -2205,13 +2273,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { - f2fs_msg(sb, KERN_INFO, - "Invalid segment count (%u)", - le32_to_cpu(raw_super->segment_count)); - return 1; - } - /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; From 0037c639e63d9823f8d6cd00599e4e554f2c06b0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 30 Apr 2018 16:27:44 +0100 Subject: [PATCH 728/804] f2fs: fix spelling mistake: "extenstion" -> "extension" Trivial fix to spelling mistake in extension list text Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c 
index 2c53de9251be..6d8d8f41e517 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -147,13 +147,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, int len = 0, i; len += snprintf(buf + len, PAGE_SIZE - len, - "cold file extenstion:\n"); + "cold file extension:\n"); for (i = 0; i < cold_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); len += snprintf(buf + len, PAGE_SIZE - len, - "hot file extenstion:\n"); + "hot file extension:\n"); for (i = cold_count; i < cold_count + hot_count; i++) len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); From 2494cc7c0bcd945ec970568b1de44f3b80aeb6d9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 May 2018 18:04:22 -0700 Subject: [PATCH 729/804] f2fs: don't drop any page on f2fs_cp_error() case We still provide readdir() after shutdown, so we should keep pages to avoid additional IOs. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7c3e8190cff2..a6c0e1023d13 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1380,11 +1380,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); - if (unlikely(f2fs_cp_error(sbi))) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - unlock_page(page); - return 0; - } + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; From 331ae0c25b4412df8e4c75d64e33791f16d1a264 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Apr 2018 17:05:51 +0800 Subject: [PATCH 730/804] Revert "f2fs: add ovp valid_blocks check for bg gc victim to fg_gc" For extreme case: 10 section, op = 10%, no_fggc_threshold = 90% All section usage: 85% 85% 85% 85% 90% 90% 95% 95% 95% 95% During foreground GC, if we skip selecting dirty sections whose usage is larger than no_fggc_threshold, we can only recycle 80% invalid space from four 85% usage
sections and two 90% usage sections, result in encountering out-of-space issue. This reverts commit e93b9865251a0503d83fd570e7d5a7c8bc351715 to fix this issue, besides, we keep the logic that we scan all dirty section when searching a victim, so that GC can select victim with least valid blocks. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/gc.c | 16 ---------------- fs/f2fs/segment.h | 9 --------- 3 files changed, 28 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0408c9eafa3a..dfbf59a0525d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1260,9 +1260,6 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ - /* threshold for converting bg victims for fg */ - u64 fggc_threshold; - /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 96b151546279..ffcb744ffcfe 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -234,10 +234,6 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; - - if (no_fggc_candidate(sbi, secno)) - continue; - clear_bit(secno, dirty_i->victim_secmap); return GET_SEG_FROM_SEC(sbi, secno); } @@ -377,9 +373,6 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; - if (gc_type == FG_GC && p.alloc_mode == LFS && - no_fggc_candidate(sbi, secno)) - goto next; cost = get_gc_cost(sbi, segno, &p); @@ -1105,17 +1098,8 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count; - DIRTY_I(sbi)->v_ops = &default_v_ops; - /* threshold of # of valid blocks in a section for victims of FG_GC */ - main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; - resv_count = SM_I(sbi)->reserved_segments << 
sbi->log_blocks_per_seg; - ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * - BLKS_PER_SEC(sbi), (main_count - resv_count)); sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e352e01854b0..21c1cc89ee6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -771,15 +771,6 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } -static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, - unsigned int secno) -{ - if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > - sbi->fggc_threshold) - return true; - return false; -} - static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) From cdcf2b3e2559797ad166d6fbf8206dc13ff25c4e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 May 2018 20:28:52 +0800 Subject: [PATCH 731/804] f2fs: fix to initialize i_current_depth according to inode type i_current_depth is used only for directory inode, but its space is shared with i_gc_failures field used for regular inode, in order to avoid affecting i_gc_failures' value, this patch fixes to initialize the union's fields according to inode type. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 12 +++++++++--- fs/f2fs/namei.c | 3 +++ fs/f2fs/super.c | 1 - 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bface995617b..e4d4b51fac31 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -232,8 +232,10 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); - - fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + if (S_ISDIR(inode->i_mode)) + fi->i_current_depth = le32_to_cpu(ri->i_current_depth); + else if (S_ISREG(inode->i_mode)) + fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; @@ -422,7 +424,11 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); + if (S_ISDIR(inode->i_mode)) + ri->i_current_depth = + cpu_to_le32(F2FS_I(inode)->i_current_depth); + else if (S_ISREG(inode->i_mode)) + ri->i_gc_failures = cpu_to_le16(F2FS_I(inode)->i_gc_failures); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e6ddc9be1e60..b32433d8667b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -54,6 +54,9 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) F2FS_I(inode)->i_crtime = current_time(inode); inode->i_generation = sbi->s_next_generation++; + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_current_depth = 1; + err = insert_inode_locked(inode); if (err) { err 
= -EINVAL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ee74aa4a5f84..cc55475832e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -830,7 +830,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); - fi->i_current_depth = 1; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); From 9bb4d22cf5de448a6d5ebad67f7b8a27c9eacd0a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 May 2018 14:06:03 +0800 Subject: [PATCH 732/804] f2fs: fix to let checkpoint guarantee atomic page persistence 1. thread A: commit_inmem_pages submits data to the block layer, but hasn't waited for its writeback. 2. thread A: commit_inmem_pages updates the related node. 3. thread B: do checkpoint, flush all nodes to disk. 4. SPOR Then, the atomic file becomes corrupted since nodes are flushed before data. This patch fixes this by treating an atomic page as a checkpoint-guaranteed one, so that in checkpoint, we can make sure all atomic pages are written back together with the metadata of the atomic file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4436aba07617..67e3b59da064 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,6 +47,8 @@ static bool __is_cp_guaranteed(struct page *page) if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode) || + (S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_ATOMIC_FILE)) || is_cold_data(page)) return true; return false; From a5d0ccbc189a02a0931d7a3ee092d64f89d69f0f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 May 2018 18:59:55 +0800 Subject: [PATCH 733/804] f2fs: fix to initialize min_mtime with ULLONG_MAX Since sit_i.min_mtime's type is unsigned long long, we should initialize it with the max value of the type, ULLONG_MAX, instead of LLONG_MAX.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 917d7acb12cf..719022d62d8f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3919,7 +3919,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) down_write(&sit_i->sentry_lock); - sit_i->min_mtime = LLONG_MAX; + sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { unsigned int i; From 461247b21fde524b9022dcadb2a8e751ab520a55 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 May 2018 22:25:08 +0800 Subject: [PATCH 734/804] f2fs: clean up with is_valid_blkaddr() - rename is_valid_blkaddr() to is_valid_meta_blkaddr() for readability. - introduce is_valid_blkaddr() for cleanup. No logic change in this patch. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 18 +++++------------- fs/f2fs/f2fs.h | 9 ++++++++- fs/f2fs/file.c | 2 +- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 5 ++--- fs/f2fs/recovery.c | 6 +++--- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 2 +- 9 files changed, 25 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6d331c21f7ce..4e50459b3ad3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -119,7 +119,7 @@ struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, false); } -bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) +bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { case META_NAT: @@ -176,7 +176,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if (!is_valid_blkaddr(sbi, blkno, type)) + if (!is_valid_meta_blkaddr(sbi, blkno, type)) goto out; switch (type) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c 
index 67e3b59da064..3000fa45b34d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -484,7 +484,7 @@ next: spin_unlock(&io->io_lock); } - if (fio->old_blkaddr != NEW_ADDR) + if (is_valid_blkaddr(fio->old_blkaddr)) verify_block_addr(fio, fio->old_blkaddr); verify_block_addr(fio, fio->new_blkaddr); @@ -1044,7 +1044,7 @@ next_dnode: next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (!is_valid_blkaddr(blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1678,15 +1678,6 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) return should_update_inplace(inode, fio); } -static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) -{ - if (fio->old_blkaddr == NEW_ADDR) - return false; - if (fio->old_blkaddr == NULL_ADDR) - return false; - return true; -} - int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; @@ -1701,7 +1692,7 @@ int do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (valid_ipu_blkaddr(fio)) { + if (is_valid_blkaddr(fio->old_blkaddr)) { ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; @@ -1728,7 +1719,8 @@ got_it: * If current allocation needs SSR, * it had better in-place writes for updated data. 
*/ - if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + if (ipu_force || (is_valid_blkaddr(fio->old_blkaddr) && + need_inplace_update(fio))) { err = encrypt_one_page(fio); if (err) goto out_writepage; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dfbf59a0525d..4d4a344f2a55 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2728,6 +2728,13 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } +static inline bool is_valid_blkaddr(block_t blkaddr) +{ + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return false; + return true; +} + /* * file.c */ @@ -2946,7 +2953,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ae21400d3ad5..0fb9f15f2068 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -354,7 +354,7 @@ static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || - (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) + is_valid_blkaddr(blkaddr)) return true; break; case SEEK_HOLE: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e4d4b51fac31..a814dd221eed 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -72,7 +72,7 @@ static bool __written_first_block(struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (addr != NEW_ADDR && addr != NULL_ADDR) + if 
(is_valid_blkaddr(addr)) return true; return false; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a6c0e1023d13..50f6ee79f3f7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -379,8 +379,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, new_blkaddr == NULL_ADDR); f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR && - nat_get_blkaddr(e) != NULL_ADDR && + f2fs_bug_on(sbi, is_valid_blkaddr(nat_get_blkaddr(e)) && new_blkaddr == NEW_ADDR); /* increment version no as node is removed */ @@ -391,7 +390,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* change address */ nat_set_blkaddr(e, new_blkaddr); - if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7305226a7476..3c3551811134 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -252,7 +252,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) + if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) return 0; page = get_tmp_page(sbi, blkaddr); @@ -506,7 +506,7 @@ retry_dn: } /* dest is valid block, try to recover from src to dest */ - if (is_valid_blkaddr(sbi, dest, META_POR)) { + if (is_valid_meta_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = reserve_new_block(&dn); @@ -567,7 +567,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) + if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) break; ra_meta_pages_cond(sbi, blkaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 719022d62d8f..4412c506c6ad 100644 --- 
a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1975,7 +1975,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) struct seg_entry *se; bool is_cp = false; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(blkaddr)) return true; down_read(&sit_i->sentry_lock); @@ -3040,7 +3040,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (!is_valid_blkaddr(blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 21c1cc89ee6e..3367ce263fb9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -85,7 +85,7 @@ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ + ((!is_valid_blkaddr(blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ From bf9510b162c4d0f19d4a7f834efe065b2e6b0659 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:33 -0800 Subject: [PATCH 735/804] mm: implement find_get_pages_range_tag() Patch series "Ranged pagevec tagged lookup", v3. In this series I provide a ranged variant of pagevec_lookup_tag() and use it in places where it makes sense. This series removes some common code and it also has a potential for speeding up some operations similarly as for pagevec_lookup_range() (but for now I can think of only artificial cases where this happens). This patch (of 16): Implement a variant of find_get_pages_tag() that stops iterating at given index. Lots of users of this function (through pagevec_lookup()) actually want a range lookup and all of them are currently open-coding this. Also create corresponding pagevec_lookup_range_tag() function. 
Link: http://lkml.kernel.org/r/20171009151359.31984-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Cc: Chao Yu Cc: David Howells Cc: David Sterba Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Ryusuke Konishi Cc: Steve French Cc: "Theodore Ts'o" Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 11 +++++++++-- mm/filemap.c | 33 ++++++++++++++++++++++++--------- mm/swap.c | 9 +++++---- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fbfadba81c5a..81ddfdc5d1d8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -359,8 +359,16 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages_tag(struct address_space *mapping, + pgoff_t *index, int tag, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, + nr_pages, pages); +} struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index b45d391b4540..b59927938834 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -29,9 +29,16 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, 
unsigned nr_pages); -unsigned pagevec_lookup_tag(struct pagevec *pvec, +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages); +static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages); + unsigned nr_pages) +{ + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, + nr_pages); +} static inline void pagevec_init(struct pagevec *pvec, int cold) { diff --git a/mm/filemap.c b/mm/filemap.c index 1bb007624b53..4d16907c0684 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1415,9 +1415,10 @@ repeat: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1425,8 +1426,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. 
*/ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1440,6 +1442,9 @@ restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1478,18 +1483,28 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /* * CD/DVDs are error prone. 
When a medium error occurs, the driver may fail diff --git a/mm/swap.c b/mm/swap.c index 39395fb549c0..6eefbfabafc0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1130,14 +1130,15 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, } EXPORT_SYMBOL(pagevec_lookup); -unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *index, int tag, unsigned nr_pages) +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages) { - pvec->nr = find_get_pages_tag(mapping, index, tag, + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, nr_pages, pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup_tag); +EXPORT_SYMBOL(pagevec_lookup_range_tag); /* * Perform any setup for the swap system From e25fadabb5c779787b33198d97890e8c9b3c1c7a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:37 -0800 Subject: [PATCH 736/804] btrfs: use pagevec_lookup_range_tag() We want only pages from given range in btree_write_cache_pages() and extent_write_cache_pages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-3-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: David Sterba Reviewed-by: Daniel Jordan Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 257bbdcb5df6..bc6b8635917f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3932,8 +3932,8 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE))) { unsigned i; scanned = 1; @@ -3943,11 +3943,6 @@ retry: if (!PagePrivate(page)) continue; - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -4076,8 +4071,8 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE))) { unsigned i; scanned = 1; @@ -4101,12 +4096,6 @@ retry: continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) flush_fn(data); From 1c7be24f65cdd4d053ef8c2b4ff83a150167fb80 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:41 -0800 Subject: [PATCH 737/804] ceph: use pagevec_lookup_range_tag() We want only pages from given range in ceph_writepages_start(). 
Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-4-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Reviewed-by: "Yan, Zheng" Cc: Ilya Dryomov Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/addr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a168fb..c720b5032c68 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -797,10 +797,10 @@ get_more_pages: min((pgoff_t)PAGEVEC_SIZE, max_pages - (pgoff_t)locked_pages) - 1) + 1; - pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY, want); - dout("pagevec_lookup_tag got %d\n", pvec_pages); + dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { From 18a4848ffded01d4d6f9102ce0af7fa2dd40bc7c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:44 -0800 Subject: [PATCH 738/804] ext4: use pagevec_lookup_range_tag() We want only pages from given range in ext4_writepages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-5-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index df30d04f6760..3eed917db1e7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2367,24 +2367,14 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) goto out; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) - goto out; - /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to From a05d8a6a2bdec7ed7200390d33dd45656df2d2eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:48 -0800 Subject: [PATCH 739/804] f2fs: use pagevec_lookup_range_tag() We want only pages from given range in f2fs_write_cache_pages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-6-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Chao Yu Reviewed-by: Daniel Jordan Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3000fa45b34d..a7dc2cbeb3d1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1969,8 +1969,8 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; @@ -1978,11 +1978,6 @@ retry: struct page *page = pvec.pages[i]; bool submitted = false; - if (page->index > end) { - done = 1; - break; - } - done_index = page->index; retry_write: lock_page(page); From 6cf6fb8645ffa50459fc2a1599e9ccbcce2eb87a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:51 -0800 Subject: [PATCH 740/804] f2fs: simplify page iteration loops In several places we want to iterate over all tagged pages in a mapping. However the code was apparently copied from places that iterate only over a limited range and thus it checks for index <= end, optimizes the case where we are coming close to range end which is all pointless when end == ULONG_MAX. So just remove this dead code. 
[akpm@linux-foundation.org: fix warnings] Link: http://lkml.kernel.org/r/20171009151359.31984-7-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Reviewed-by: Chao Yu Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/checkpoint.c | 13 ++++------ fs/f2fs/node.c | 59 +++++++++++++++----------------------------- 2 files changed, 25 insertions(+), 47 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4e50459b3ad3..46799d35c632 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -312,9 +312,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; + pgoff_t index = 0, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; + int nr_pages; struct writeback_control wbc = { .for_reclaim = 0, }; @@ -324,13 +325,9 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, blk_start_plug(&plug); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (unlikely(nr_pages == 0)) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 50f6ee79f3f7..8ddd435ea50b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1299,21 +1299,17 @@ iput_out: static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; struct page *last_page = NULL; + int nr_pages; pagevec_init(&pvec, 0); index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - 
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1482,13 +1478,14 @@ static int f2fs_write_node_page(struct page *page, int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { - pgoff_t index, end; + pgoff_t index; pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nr_pages; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1498,15 +1495,10 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, retry: pagevec_init(&pvec, 0); index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1605,25 +1597,21 @@ out: int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; int step = 0; int nwritten = 0; int ret = 0; + int nr_pages; pagevec_init(&pvec, 0); next_step: index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = 
pvec.pages[i]; @@ -1709,27 +1697,20 @@ continue_unlock: int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = ULONG_MAX; + pgoff_t index = 0; struct pagevec pvec; int ret2 = 0, ret = 0; + int nr_pages; pagevec_init(&pvec, 0); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (unlikely(page->index > end)) - continue; - if (ino && ino_of_node(page) == ino) { f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) From 564108e83a74d8dff72fb2e8b37ebb302fc2b8ad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:55 -0800 Subject: [PATCH 741/804] f2fs: use find_get_pages_tag() for looking up single page __get_first_dirty_index() wants to lookup only the first dirty page after given index. There's no point in using pagevec_lookup_tag() for that. Just use find_get_pages_tag() directly. 
Link: http://lkml.kernel.org/r/20171009151359.31984-8-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Chao Yu Reviewed-by: Daniel Jordan Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/file.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0fb9f15f2068..2a87f4531e0e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -333,18 +333,19 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { - struct pagevec pvec; + struct page *page; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ - pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, - PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; - pagevec_release(&pvec); + nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!nr_pages) + return ULONG_MAX; + pgofs = page->index; + put_page(page); return pgofs; } From 160355d69f4610cccc570fec7d72a8e87da4428c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:58 -0800 Subject: [PATCH 742/804] gfs2: use pagevec_lookup_range_tag() We want only pages from given range in gfs2_write_cache_jdata(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-9-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/gfs2/aops.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1caee0534587..2505627f024e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -249,22 +249,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - ret = 1; - break; - } - *done_index = page->index; lock_page(page); @@ -382,8 +366,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; From 94f1b99298bd5d82b855664f721a6f543617df4e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:02 -0800 Subject: [PATCH 743/804] nilfs2: use pagevec_lookup_range_tag() We want only pages from given range in nilfs_lookup_dirty_data_buffers(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. 
Link: http://lkml.kernel.org/r/20171009151359.31984-10-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Acked-by: Ryusuke Konishi Cc: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3b65adaae7e4..5bcd2f32449b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -705,18 +705,14 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, pagevec_init(&pvec, 0); repeat: if (unlikely(index > last) || - !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - min_t(pgoff_t, last - index, - PAGEVEC_SIZE - 1) + 1)) + !pagevec_lookup_range_tag(&pvec, mapping, &index, last, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { struct buffer_head *bh, *head; struct page *page = pvec.pages[i]; - if (unlikely(page->index > last)) - break; - lock_page(page); if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); From 26778b87a0067fd32b061d1977000c17c4685ffd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:05 -0800 Subject: [PATCH 744/804] mm: use pagevec_lookup_range_tag() in __filemap_fdatawait_range() Use pagevec_lookup_range_tag() in __filemap_fdatawait_range() as it is interested only in pages from given range. Remove unnecessary code resulting from this. 
Link: http://lkml.kernel.org/r/20171009151359.31984-11-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 4d16907c0684..1544865fa64a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -344,19 +344,17 @@ static int __filemap_fdatawait_range(struct address_space *mapping, goto out; pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + while (index <= end) { unsigned i; + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - wait_on_page_writeback(page); if (TestClearPageError(page)) ret = -EIO; From 8914877e374a1bef6834d1adfab32a4564943f12 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:09 -0800 Subject: [PATCH 745/804] mm: use pagevec_lookup_range_tag() in write_cache_pages() Use pagevec_lookup_range_tag() in write_cache_pages() as it is interested only in pages from given range. Remove unnecessary code resulting from this. 
Link: http://lkml.kernel.org/r/20171009151359.31984-12-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fd51ebfc423f..4bdd7ef43f6e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2187,30 +2187,14 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } - done_index = page->index; lock_page(page); From f3aa4a25b8b0f92f537b8f55538a4e687ead1336 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:12 -0800 Subject: [PATCH 746/804] mm: add variant of pagevec_lookup_range_tag() taking number of pages Currently pagevec_lookup_range_tag() takes number of pages to look up but most users don't need this. Create a new function pagevec_lookup_range_nr_tag() that takes maximum number of pages to lookup for Ceph which wants this functionality so that we can drop nr_pages argument from pagevec_lookup_range_tag(). 
Link: http://lkml.kernel.org/r/20171009151359.31984-13-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 +++ mm/swap.c | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index b59927938834..cfed0c5ec659 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -32,6 +32,9 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned nr_pages); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) diff --git a/mm/swap.c b/mm/swap.c index 6eefbfabafc0..bef40c04f864 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,6 +1140,15 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range_tag); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages) +{ + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ From feb94dc82928286a323576eaaadfec057d474112 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:16 -0800 Subject: [PATCH 747/804] ceph: use pagevec_lookup_range_nr_tag() Use new function for looking up pages since nr_pages argument from pagevec_lookup_range_tag() is going away. 
Link: http://lkml.kernel.org/r/20171009151359.31984-14-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: "Yan, Zheng" Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/addr.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c720b5032c68..e6bb73963914 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -783,8 +783,7 @@ retry: struct page **pages = NULL; mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; - int want; - u64 offset, len; + u64 offset = 0, len = 0; long writeback_stat; next = 0; @@ -793,13 +792,9 @@ retry: get_more_pages: first = -1; - want = min(end - index, - min((pgoff_t)PAGEVEC_SIZE, - max_pages - (pgoff_t)locked_pages) - 1) - + 1; pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, PAGECACHE_TAG_DIRTY, - want); + max_pages - locked_pages); dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; From 91e7d9d2ddbfda40393c8400e3d0b4852ea3c6d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:19 -0800 Subject: [PATCH 748/804] mm: remove nr_pages argument from pagevec_lookup_{,range}_tag() All users of pagevec_lookup_tag() and pagevec_lookup_range_tag() now pass PAGEVEC_SIZE as a desired number of pages. Just drop the argument.
Link: http://lkml.kernel.org/r/20171009151359.31984-15-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 6 +++--- fs/ceph/addr.c | 3 +-- fs/ext4/inode.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 8 ++++---- fs/gfs2/aops.c | 2 +- fs/nilfs2/btree.c | 4 ++-- fs/nilfs2/page.c | 7 +++---- fs/nilfs2/segment.c | 6 +++--- include/linux/pagevec.h | 8 +++----- mm/filemap.c | 2 +- mm/page-writeback.c | 2 +- mm/swap.c | 4 ++-- 14 files changed, 27 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bc6b8635917f..322a4046a23a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3933,7 +3933,7 @@ retry: tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + tag))) { unsigned i; scanned = 1; @@ -4071,8 +4071,8 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, + &index, end, tag))) { unsigned i; scanned = 1; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e6bb73963914..c30366bb034e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -793,8 +793,7 @@ retry: get_more_pages: first = -1; pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY, - max_pages - locked_pages); + end, PAGECACHE_TAG_DIRTY); dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3eed917db1e7..0550beb2b255 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2368,7 +2368,7 @@ static int mpage_prepare_extent_to_map(struct 
mpage_da_data *mpd) mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) goto out; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 46799d35c632..0159a84ba02d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -326,7 +326,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, blk_start_plug(&plug); while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a7dc2cbeb3d1..02be804b1226 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1970,7 +1970,7 @@ retry: int i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ddd435ea50b..0d6bb27370ff 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1308,7 +1308,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1497,7 +1497,7 @@ retry: index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1610,7 +1610,7 @@ next_step: index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1705,7 +1705,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) pagevec_init(&pvec, 0); while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE))) { + 
PAGECACHE_TAG_WRITEBACK))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 2505627f024e..582ef53f2104 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -367,7 +367,7 @@ retry: done_index = index; while (!done && (index <= end)) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 3a3821b00486..9deca59be7e5 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2147,8 +2147,8 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, btcache, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 45d650addd56..447999563737 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -262,8 +262,7 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -382,8 +381,8 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 5bcd2f32449b..37781eaffc00 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -706,7 +706,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, repeat: if (unlikely(index > 
last) || !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) + PAGECACHE_TAG_DIRTY)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -749,8 +749,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index cfed0c5ec659..cead4419f933 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -31,16 +31,14 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages); + int tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages) + struct address_space *mapping, pgoff_t *index, int tag) { - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, - nr_pages); + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } static inline void pagevec_init(struct pagevec *pvec, int cold) diff --git a/mm/filemap.c b/mm/filemap.c index 1544865fa64a..226e4d89ef5d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -348,7 +348,7 @@ static int __filemap_fdatawait_range(struct address_space *mapping, unsigned i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE); + end, PAGECACHE_TAG_WRITEBACK); if (!nr_pages) break; diff --git 
a/mm/page-writeback.c b/mm/page-writeback.c index 4bdd7ef43f6e..38979615c7ad 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2188,7 +2188,7 @@ retry: int i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/mm/swap.c b/mm/swap.c index bef40c04f864..8e6bcb688779 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1132,10 +1132,10 @@ EXPORT_SYMBOL(pagevec_lookup); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages) + int tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - nr_pages, pvec->pages); + PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } EXPORT_SYMBOL(pagevec_lookup_range_tag); From ed74404955cd8eeaa41ff1aa57a5af6f8e6f62a8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 May 2018 22:25:09 +0800 Subject: [PATCH 749/804] f2fs: detect synchronous writeback more earlier This patch detects synchronous writeback earlier than before, in order to avoid unnecessary page writeback before exiting asynchronous writeback.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 02be804b1226..9deff7960bb2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1978,6 +1978,13 @@ retry: struct page *page = pvec.pages[i]; bool submitted = false; + /* give a priority to WB_SYNC threads */ + if (atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + done_index = page->index; retry_write: lock_page(page); @@ -2032,9 +2039,7 @@ continue_unlock: last_idx = page->index; } - /* give a priority to WB_SYNC threads */ - if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || - --wbc->nr_to_write <= 0) && + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; From 9db5be4af890fdacab65a4c746f5e330537d1e16 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Fri, 18 May 2018 11:51:52 +0530 Subject: [PATCH 750/804] f2fs: Fix deadlock in shutdown ioctl f2fs_ioc_shutdown() ioctl gets stuck in the below path when issued with F2FS_GOING_DOWN_FULLSYNC option. 
__switch_to+0x90/0xc4 percpu_down_write+0x8c/0xc0 freeze_super+0xec/0x1e4 freeze_bdev+0xc4/0xcc f2fs_ioctl+0xc0c/0x1ce0 f2fs_compat_ioctl+0x98/0x1f0 Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2a87f4531e0e..ab476867c30b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1857,9 +1857,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; - ret = mnt_want_write_file(filp); - if (ret) - return ret; + if (in != F2FS_GOING_DOWN_FULLSYNC) { + ret = mnt_want_write_file(filp); + if (ret) + return ret; + } switch (in) { case F2FS_GOING_DOWN_FULLSYNC: @@ -1900,7 +1902,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_update_time(sbi, REQ_TIME); out: - mnt_drop_write_file(filp); + if (in != F2FS_GOING_DOWN_FULLSYNC) + mnt_drop_write_file(filp); return ret; } From c4408c238722fdfd3302be50dcce1f89c12d6666 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 23 Apr 2018 10:36:13 +0800 Subject: [PATCH 751/804] f2fs: fix to wait page writeback during revoking atomic write After revoking atomic write, related LBA can be reused by others, so we need to wait page writeback before reusing the LBA, in order to avoid interference between old atomic written in-flight IO and new IO. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4412c506c6ad..a31517e231b6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -230,6 +230,8 @@ static int __revoke_inmem_pages(struct inode *inode, lock_page(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (recover) { struct dnode_of_data dn; struct node_info ni; From 1f62e4702a34d1fc33be8734777fd858b1147bb7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 May 2018 09:00:13 +0800 Subject: [PATCH 752/804] f2fs: keep migration IO order in LFS mode For non-migration IO, we will keep order of data/node blocks' submitting as allocation sequence by sorting IOs in per log io_list list, but for migration IO, it could be out-of-order. In LFS mode, we should keep all IOs including migration IO be ordered, so that this patch fixes to add an additional lock to keep submitting order. Signed-off-by: Chao Yu Signed-off-by: Yunlong Song Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/gc.c | 6 ++++++ fs/f2fs/segment.c | 5 +++++ fs/f2fs/super.c | 1 + 4 files changed, 14 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d4a344f2a55..ca5dc3b8d066 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1178,6 +1178,8 @@ struct f2fs_sb_info { struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; /* bio ordering for NODE/DATA */ + /* keep migration IO order for LFS mode */ + struct rw_semaphore io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ffcb744ffcfe..bd189e5bc4d7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -614,6 +614,7 @@ static void move_data_block(struct inode *inode, block_t bidx, struct page *page; block_t newaddr; int err; + bool lfs_mode = test_opt(fio.sbi, LFS); /* do not read out */ page = 
f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -654,6 +655,9 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + if (lfs_mode) + down_write(&fio.sbi->io_order_lock); + allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); @@ -710,6 +714,8 @@ static void move_data_block(struct inode *inode, block_t bidx, put_page_out: f2fs_put_page(fio.encrypted_page, 1); recover_block: + if (lfs_mode) + up_write(&fio.sbi->io_order_lock); if (err) __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, true, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a31517e231b6..ce5a2bd19e4b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2831,7 +2831,10 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); int err; + bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); + if (keep_order) + down_read(&fio->sbi->io_order_lock); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); @@ -2844,6 +2847,8 @@ reallocate: } else if (!err) { update_device_state(fio); } + if (keep_order) + up_read(&fio->sbi->io_order_lock); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cc55475832e2..7064f6e33211 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2367,6 +2367,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_PAGE_TYPE - 1; i++) for (j = HOT; j < NR_TEMP_TYPE; j++) mutex_init(&sbi->wio_mutex[i][j]); + init_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); sbi->dirty_device = 0; From 405909e7f53293a13c9a0fad5c81ce1472e9fd32 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 7 May 2018 14:22:40 -0700 Subject: [PATCH 753/804] f2fs: introduce sbi->gc_mode to determine the policy This 
is to avoid sbi->gc_thread pointer access. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/gc.c | 28 ++++++++++++---------------- fs/f2fs/gc.h | 2 -- fs/f2fs/segment.c | 4 ++-- fs/f2fs/sysfs.c | 33 +++++++++++++++++++++++++-------- 5 files changed, 47 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ca5dc3b8d066..6b3573cf7f10 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1130,6 +1130,13 @@ enum { MAX_TIME, }; +enum { + GC_NORMAL, + GC_IDLE_CB, + GC_IDLE_GREEDY, + GC_URGENT, +}; + enum { WHINT_MODE_OFF, /* not pass down write hints */ WHINT_MODE_USER, /* try to pass down hints given by users */ @@ -1261,6 +1268,7 @@ struct f2fs_sb_info { struct mutex gc_mutex; /* mutex for GC */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + unsigned int gc_mode; /* current GC state */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bd189e5bc4d7..1df27eb2ec14 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -76,7 +76,7 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. */ - if (gc_th->gc_urgent) { + if (sbi->gc_mode == GC_URGENT) { wait_ms = gc_th->urgent_sleep_time; mutex_lock(&sbi->gc_mutex); goto do_gc; @@ -131,8 +131,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_idle = 0; - gc_th->gc_urgent = 0; gc_th->gc_wake= 0; sbi->gc_thread = gc_th; @@ -158,21 +156,19 @@ void stop_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = NULL; } -static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) +static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) { int gc_mode = (gc_type == BG_GC) ? 
GC_CB : GC_GREEDY; - if (!gc_th) - return gc_mode; - - if (gc_th->gc_idle) { - if (gc_th->gc_idle == 1) - gc_mode = GC_CB; - else if (gc_th->gc_idle == 2) - gc_mode = GC_GREEDY; - } - if (gc_th->gc_urgent) + switch (sbi->gc_mode) { + case GC_IDLE_CB: + gc_mode = GC_CB; + break; + case GC_IDLE_GREEDY: + case GC_URGENT: gc_mode = GC_GREEDY; + break; + } return gc_mode; } @@ -187,7 +183,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { - p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); + p->gc_mode = select_gc_type(sbi, gc_type); p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; @@ -195,7 +191,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, /* we need to check every dirty segments in the FG_GC case */ if (gc_type != FG_GC && - (sbi->gc_thread && !sbi->gc_thread->gc_urgent) && + (sbi->gc_mode != GC_URGENT) && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b0045d4c8d1e..c8619e408009 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -36,8 +36,6 @@ struct f2fs_gc_kthread { unsigned int no_gc_sleep_time; /* for changing gc mode */ - unsigned int gc_idle; - unsigned int gc_urgent; unsigned int gc_wake; }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ce5a2bd19e4b..8656295c76e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -177,7 +177,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) if (test_opt(sbi, LFS)) return false; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + if (sbi->gc_mode == GC_URGENT) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + @@ -1485,7 +1485,7 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) dcc->discard_wake = 0; - if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + if (sbi->gc_mode == GC_URGENT) 
__init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6d8d8f41e517..dd940d156af6 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -248,16 +248,33 @@ out: if (!strcmp(a->attr.name, "trim_sections")) return -EINVAL; + if (!strcmp(a->attr.name, "gc_urgent")) { + if (t >= 1) { + sbi->gc_mode = GC_URGENT; + if (sbi->gc_thread) { + wake_up_interruptible_all( + &sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + } else { + sbi->gc_mode = GC_NORMAL; + } + return count; + } + if (!strcmp(a->attr.name, "gc_idle")) { + if (t == GC_IDLE_CB) + sbi->gc_mode = GC_IDLE_CB; + else if (t == GC_IDLE_GREEDY) + sbi->gc_mode = GC_IDLE_GREEDY; + else + sbi->gc_mode = GC_NORMAL; + return count; + } + *ui = t; if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) f2fs_reset_iostat(sbi); - if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { - sbi->gc_thread->gc_wake = 1; - wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); - wake_up_discard_thread(sbi, true); - } - return count; } @@ -349,8 +366,8 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); From b125dfb20d18db91eac671aa241346cd1e1c0106 Mon Sep 17 00:00:00 2001 From: 
Chao Yu Date: Mon, 7 May 2018 20:28:54 +0800 Subject: [PATCH 754/804] f2fs: avoid stucking GC due to atomic write f2fs doesn't allow abuse on atomic write class interface, so except limiting in-mem pages' total memory usage capacity, we need to limit atomic-write usage as well when filesystem is seriously fragmented, otherwise we may run into infinite loop during foreground GC because target blocks in victim segment belong to atomic opened file for long time. Now, we will detect failure due to atomic write in foreground GC, if the count exceeds threshold, we will drop all atomic written data in cache, by this, I expect it can keep our system running safely to prevent DoS attack. In addition, this patch shows GC skip information in debugfs, now it just shows count of skipped caused by atomic write. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 6 ++++++ fs/f2fs/f2fs.h | 21 +++++++++++++++------ fs/f2fs/file.c | 20 ++++++++++++++------ fs/f2fs/gc.c | 27 +++++++++++++++++++++++---- fs/f2fs/inode.c | 6 ++++-- fs/f2fs/segment.c | 11 ++++++++++- fs/f2fs/segment.h | 2 ++ 8 files changed, 75 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9deff7960bb2..0c9e3e186f79 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2322,7 +2322,7 @@ fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); if (drop_atomic) - drop_inmem_pages_all(sbi); + drop_inmem_pages_all(sbi, false); return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a66107b5cfff..2d65e77ae5cf 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -104,6 +104,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; + si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; + si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free =
(int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -342,6 +344,10 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); + seq_printf(s, "Skipped : atomic write %llu (%llu)\n", + si->skipped_atomic_files[BG_GC] + + si->skipped_atomic_files[FG_GC], + si->skipped_atomic_files[BG_GC]); seq_puts(s, "\nExtent Cache:\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6b3573cf7f10..6a04aae9480e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -681,15 +681,20 @@ enum { #define DEF_DIR_LEVEL 0 +enum { + GC_FAILURE_PIN, + GC_FAILURE_ATOMIC, + MAX_GC_FAILURE +}; + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ unsigned char i_dir_level; /* use for dentry level for large dir */ - union { - unsigned int i_current_depth; /* only for directory depth */ - unsigned short i_gc_failures; /* only for regular file */ - }; + unsigned int i_current_depth; /* only for directory depth */ + /* for gc failure statistic */ + unsigned int i_gc_failures[MAX_GC_FAILURE]; unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ @@ -1269,6 +1274,8 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ + /* for skip statistic */ + unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; @@ -2312,6 +2319,7 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ 
FI_PIN_FILE, /* indicate file should not be gced */ + FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2410,7 +2418,7 @@ static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) static inline void f2fs_i_gc_failures_write(struct inode *inode, unsigned int count) { - F2FS_I(inode)->i_gc_failures = count; + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = count; f2fs_mark_inode_dirty_sync(inode, true); } @@ -2901,7 +2909,7 @@ void destroy_node_manager_caches(void); */ bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); -void drop_inmem_pages_all(struct f2fs_sb_info *sbi); +void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); @@ -3092,6 +3100,7 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; + unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab476867c30b..4b7629f2422e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1697,6 +1697,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; @@ -1738,12 +1739,17 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: + if 
(is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + ret = -EINVAL; + } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); @@ -2532,12 +2538,14 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) /* Use i_gc_failures for normal file as a risk signal. */ if (inc) - f2fs_i_gc_failures_write(inode, fi->i_gc_failures + 1); + f2fs_i_gc_failures_write(inode, + fi->i_gc_failures[GC_FAILURE_PIN] + 1); - if (fi->i_gc_failures > sbi->gc_pin_file_threshold) { + if (fi->i_gc_failures[GC_FAILURE_PIN] > sbi->gc_pin_file_threshold) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: Enable GC = ino %lx after %x GC trials\n", - __func__, inode->i_ino, fi->i_gc_failures); + __func__, inode->i_ino, + fi->i_gc_failures[GC_FAILURE_PIN]); clear_inode_flag(inode, FI_PIN_FILE); return -EAGAIN; } @@ -2575,7 +2583,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); - F2FS_I(inode)->i_gc_failures = 1; + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = 1; goto done; } @@ -2588,7 +2596,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) goto out; set_inode_flag(inode, FI_PIN_FILE); - ret = F2FS_I(inode)->i_gc_failures; + ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; done: f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -2603,7 +2611,7 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) __u32 pin = 0; if (is_inode_flag_set(inode, FI_PIN_FILE)) - pin = F2FS_I(inode)->i_gc_failures; + pin = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; return put_user(pin, (u32 __user *)arg); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 1df27eb2ec14..ef8291f705dc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -592,7 +592,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, * This can be used to move blocks, aka LBAs, directly on disk. 
*/ static void move_data_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) + int gc_type, unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -620,8 +620,11 @@ static void move_data_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; goto out; + } if (f2fs_is_pinned_file(inode)) { f2fs_pin_file_control(inode, true); @@ -733,8 +736,11 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; + F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; goto out; + } if (f2fs_is_pinned_file(inode)) { if (gc_type == FG_GC) f2fs_pin_file_control(inode, true); @@ -896,7 +902,8 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_post_read_required(inode)) - move_data_block(inode, start_bidx, segno, off); + move_data_block(inode, start_bidx, gc_type, + segno, off); else move_data_page(inode, start_bidx, gc_type, segno, off); @@ -1013,6 +1020,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1064,11 +1073,21 @@ gc_more: sec_freed++; total_freed += seg_freed; + if (gc_type == FG_GC) { + if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + skipped_round++; + last_skipped = sbi->skipped_atomic_files[FG_GC]; + round++; + } + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { if 
(has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round > MAX_SKIP_ATOMIC_COUNT && + skipped_round * 2 >= round) + drop_inmem_pages_all(sbi, true); segno = NULL_SEGNO; goto gc_more; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index a814dd221eed..2f8c99ab99f7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -235,7 +235,8 @@ static int do_read_inode(struct inode *inode) if (S_ISDIR(inode->i_mode)) fi->i_current_depth = le32_to_cpu(ri->i_current_depth); else if (S_ISREG(inode->i_mode)) - fi->i_gc_failures = le16_to_cpu(ri->i_gc_failures); + fi->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(ri->i_gc_failures); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; @@ -428,7 +429,8 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); else if (S_ISREG(inode->i_mode)) - ri->i_gc_failures = cpu_to_le16(F2FS_I(inode)->i_gc_failures); + ri->i_gc_failures = + cpu_to_le16(F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]); ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid); ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8656295c76e3..e2317c6c1080 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -273,7 +273,7 @@ next: return err; } -void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) { struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; struct inode *inode; @@ -289,9 +289,17 @@ next: spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); if (inode) { + if (gc_failure) { + if (fi->i_gc_failures[GC_FAILURE_ATOMIC]) + goto drop; + goto skip; + } +drop: + set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); drop_inmem_pages(inode); iput(inode); } +skip: congestion_wait(BLK_RW_ASYNC, HZ/50); cond_resched(); goto next; @@ -311,6 +319,7 @@ void 
drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; stat_dec_atomic_write(inode); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3367ce263fb9..3e7ef7c6771f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -215,6 +215,8 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) +#define MAX_SKIP_ATOMIC_COUNT 16 + struct inmem_pages { struct list_head list; struct page *page; From b25a1872e9a518c8ea5c76bb8441209db3117574 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 8 Apr 2018 15:11:11 +0800 Subject: [PATCH 755/804] f2fs: let discard thread wait a little longer if dev is busy This patch modify discard thread wait policy as below: issued io_interrupted wait time(ms) 1. 8 0 50 2. (0,8) 1 50 3. 0 1 500 (dev is busy) 4. 0 0 60000 (no candidates) Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6a04aae9480e..fab0ccf95a9f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -247,6 +247,7 @@ enum { #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MAX_DISCARD_LEN 512 /* Max. 
2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -349,6 +350,7 @@ enum { struct discard_policy { int type; /* type of discard */ unsigned int min_interval; /* used for candidates exist */ + unsigned int mid_interval; /* used for device busy */ unsigned int max_interval; /* used for candidates not exist */ unsigned int max_requests; /* # of discards issued per round */ unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e2317c6c1080..fe3b6c3e7553 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1018,6 +1018,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = true; dpolicy->sync = false; @@ -1027,6 +1028,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME; dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { @@ -1500,9 +1502,11 @@ static int issue_discard_thread(void *data) sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); - if (issued) { + if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; + } else if (issued == -1){ + wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } From e72a2cca82d8e8809be75012ded23781434d31fa Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: 
Tue, 8 May 2018 17:51:34 +0800 Subject: [PATCH 756/804] f2fs: clear discard_wake earlier If SBI_NEED_FSCK is set, discard_wake will never be cleared. As a result, the condition of wait_event_interruptible_timeout() is always true, which gets discard thread run too frequently. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fe3b6c3e7553..0caabb0f42bc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1482,6 +1482,10 @@ static int issue_discard_thread(void *data) kthread_should_stop() || freezing(current) || dcc->discard_wake, msecs_to_jiffies(wait_ms)); + + if (dcc->discard_wake) + dcc->discard_wake = 0; + if (try_to_freeze()) continue; if (f2fs_readonly(sbi->sb)) @@ -1493,9 +1497,6 @@ static int issue_discard_thread(void *data) continue; } - if (dcc->discard_wake) - dcc->discard_wake = 0; - if (sbi->gc_mode == GC_URGENT) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); From c74034518fdc8b21a2b3f0aace06965cea5fa09d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 May 2018 18:03:34 +0800 Subject: [PATCH 757/804] f2fs: fix to don't trigger writeback during recovery - f2fs_fill_super - recover_fsync_data - recover_data - del_fsync_inode - iput - iput_final - write_inode_now - f2fs_write_inode - f2fs_balance_fs - f2fs_balance_fs_bg - sync_dirty_inodes With data_flush mount option, during recovery, in order to avoid entering above writeback flow, let's detect recovery status and do skip in f2fs_balance_fs_bg. 
Signed-off-by: Chao Yu Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0caabb0f42bc..4557704a852e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -486,6 +486,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return; + /* try to shrink extent cache when there is no enough memory */ if (!available_free_memory(sbi, EXTENT_CACHE)) f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); From d2e0f2f786a68136d2fb5c57c669896eba2f4d7f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 May 2018 18:03:35 +0800 Subject: [PATCH 758/804] f2fs: clean up with clear_radix_tree_dirty_tag Introduce clear_radix_tree_dirty_tag to include common codes for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ fs/f2fs/dir.c | 8 +------- fs/f2fs/f2fs.h | 1 + fs/f2fs/inline.c | 7 +------ fs/f2fs/node.c | 12 ++---------- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0c9e3e186f79..da09ba77f874 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2588,6 +2588,17 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; +void clear_radix_tree_dirty_tag(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); +} + int __init f2fs_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 41d32171bd52..2887bcef118b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -698,8 +698,6 @@ void f2fs_delete_entry(struct 
f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - struct address_space *mapping = page_mapping(page); - unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -732,11 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - + clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fab0ccf95a9f..ccdcf5865fb9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3049,6 +3049,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); +void clear_radix_tree_dirty_tag(struct page *page); /* * gc.c diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 85371b0971d9..b51cc241f354 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -204,8 +204,6 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; - struct address_space *mapping = page_mapping(page); - unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -227,10 +225,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_radix_tree_dirty_tag(page); set_inode_flag(inode, 
FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0d6bb27370ff..2902e1fadebc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -102,18 +102,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; - unsigned int long flags; - if (PageDirty(page)) { - spin_lock_irqsave(&mapping->tree_lock, flags); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); - + clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); - dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); + dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } ClearPageUptodate(page); } From a1259450b6dba27306a065b4e079d8fb234d0a4e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 16:57:32 +0800 Subject: [PATCH 759/804] f2fs: fix to avoid race during access gc_thread pointer Thread A Thread B - f2fs_remount - stop_gc_thread - f2fs_sbi_store sbi->gc_thread = NULL; access sbi->gc_thread->gc_* Previously, we allocate memory for sbi->gc_thread based on background gc thread mount option, the memory can be released if we turn off that mount option, but still there are several places that access the gc_thread pointer without considering race condition, resulting in a NULL pointer dereference. In order to fix this issue, use sb->s_umount to exclude those operations.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index dd940d156af6..ac3ea6044936 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -165,7 +165,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, +static ssize_t __f2fs_sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { @@ -278,6 +278,23 @@ out: return count; } +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + ssize_t ret; + bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || + a->struct_type == GC_THREAD); + + if (gc_entry) + down_read(&sbi->sb->s_umount); + ret = __f2fs_sbi_store(a, sbi, buf, count); + if (gc_entry) + up_read(&sbi->sb->s_umount); + + return ret; +} + static ssize_t f2fs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { From 0291bd36d076f2ff8c6c6cc3b8715b3399680e00 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 16:59:26 +0800 Subject: [PATCH 760/804] f2fs: don't drop dentry pages after fs shutdown As described in commit "f2fs: don't drop any page on f2fs_cp_error() case": "We still provide readdir() after shtudown, so we should keep pages to avoid additional IOs." In order to provide the latest directory structure, let's keep dentry pages in cache after fs shutdown.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index da09ba77f874..65e7669155e8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1798,6 +1798,12 @@ static int __write_data_page(struct page *page, bool *submitted, /* we should bypass data pages to proceed the kworkder jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(page->mapping, -EIO); + /* + * don't drop any dirty dentry pages for keeping lastest + * directory structure. + */ + if (S_ISDIR(inode->i_mode)) + goto redirty_out; goto out; } From 02afc275a5bd35825e77a91a3aaad8d5aa730dbd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 16:59:27 +0800 Subject: [PATCH 761/804] f2fs: fix error path of move_data_page This patch fixes error path of move_data_page: - clear cold data flag if it fails to write page. - redirty page for non-ENOMEM case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ef8291f705dc..2b81537387c9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -780,9 +780,14 @@ retry: set_cold_data(page); err = do_write_data_page(&fio); - if (err == -ENOMEM && is_dirty) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; + if (err) { + clear_cold_data(page); + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (is_dirty) + set_page_dirty(page); } } out: From e48fcd857657d3328d9f148f490c64c5e147fb85 Mon Sep 17 00:00:00 2001 From: Anatoly Pugachev Date: Mon, 28 May 2018 02:06:37 +0300 Subject: [PATCH 762/804] disable loading f2fs module on PAGE_SIZE > 4KB The following patch disables loading of f2fs module on architectures which have PAGE_SIZE > 4096 , since it is impossible to mount f2fs on such architectures , log messages are: mount: /mnt: wrong fs type, bad option, bad superblock on /dev/vdiskb1, missing 
codepage or helper program, or other error. /dev/vdiskb1: F2FS filesystem, UUID=1d8b9ca4-2389-4910-af3b-10998969f09c, volume name "" May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Invalid page_cache_size (8192), supports only 4KB May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Can't find valid F2FS filesystem in 1th superblock May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Invalid page_cache_size (8192), supports only 4KB May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Can't find valid F2FS filesystem in 2th superblock May 15 18:03:13 ttip kernel: F2FS-fs (vdiskb1): Invalid page_cache_size (8192), supports only 4KB which was introduced by git commit 5c9b469295fb6b10d98923eab5e79c4edb80ed20 tested on git kernel 4.17.0-rc6-00309-gec30dcf7f425 with patch applied: modprobe: ERROR: could not insert 'f2fs': Invalid argument May 28 01:40:28 v215 kernel: F2FS not supported on PAGE_SIZE(8192) != 4096 Signed-off-by: Anatoly Pugachev Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7064f6e33211..9819c04e6848 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3130,6 +3130,12 @@ static int __init init_f2fs_fs(void) { int err; + if (PAGE_SIZE != F2FS_BLKSIZE) { + printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n", + PAGE_SIZE, F2FS_BLKSIZE); + return -EINVAL; + } + f2fs_build_trace_ios(); err = init_inodecache(); From b7f55946709538653ce33f06e67219c15b140039 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 23:47:18 +0800 Subject: [PATCH 763/804] f2fs: fix to let caller retry allocating block address Configure io_bits with 2 and enable LFS mode, generic/013 reports below dmesg: BUG: unable to handle kernel NULL pointer dereference at 00000104 *pdpt = 0000000029b7b001 *pde = 0000000000000000 Oops: 0002 [#1] PREEMPT SMP Modules linked in: crc32_generic zram f2fs(O) rfcomm bnep bluetooth ecdh_generic snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm 
snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq pcbc joydev snd_seq_device aesni_intel snd_timer aes_i586 snd crypto_simd cryptd soundcore i2c_piix4 serio_raw mac_hid video parport_pc ppdev lp parport hid_generic psmouse usbhid hid e1000 CPU: 0 PID: 11161 Comm: fsstress Tainted: G O 4.17.0-rc2 #38 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 EIP: f2fs_submit_page_write+0x28d/0x550 [f2fs] EFLAGS: 00010206 CPU: 0 EAX: e863dcd8 EBX: 00000000 ECX: 00000100 EDX: 00000200 ESI: e863dcf4 EDI: f6f82768 EBP: e863dbb0 ESP: e863db74 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 80050033 CR2: 00000104 CR3: 29a62020 CR4: 000406f0 Call Trace: do_write_page+0x6f/0xc0 [f2fs] write_data_page+0x4a/0xd0 [f2fs] do_write_data_page+0x327/0x630 [f2fs] __write_data_page+0x34b/0x820 [f2fs] __f2fs_write_data_pages+0x42d/0x8c0 [f2fs] f2fs_write_data_pages+0x27/0x30 [f2fs] do_writepages+0x1a/0x70 __filemap_fdatawrite_range+0x94/0xd0 filemap_write_and_wait_range+0x3d/0xa0 __generic_file_write_iter+0x11a/0x1f0 f2fs_file_write_iter+0xdd/0x3b0 [f2fs] __vfs_write+0xd2/0x150 vfs_write+0x9b/0x190 ksys_write+0x45/0x90 sys_write+0x16/0x20 do_fast_syscall_32+0xaa/0x22c entry_SYSENTER_32+0x4c/0x7b EIP: 0xb7fc8c51 EFLAGS: 00000246 CPU: 0 EAX: ffffffda EBX: 00000003 ECX: 09cde000 EDX: 00001000 ESI: 00000003 EDI: 00001000 EBP: 00000000 ESP: bfbded38 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b Code: e8 f9 77 34 c9 8b 45 e0 8b 80 b8 00 00 00 39 45 d8 0f 84 bb 02 00 00 8b 45 e0 8b 80 b8 00 00 00 8d 50 d8 8b 08 89 55 f0 8b 50 04 <89> 51 04 89 0a c7 00 00 01 00 00 c7 40 04 00 02 00 00 8b 45 dc EIP: f2fs_submit_page_write+0x28d/0x550 [f2fs] SS:ESP: 0068:e863db74 CR2: 0000000000000104 ---[ end trace 4cac79c0d1305ee6 ]--- allocate_data_block will submit all sequential pending IOs sorted by a FIFO list, If we failed to submit other user's IO due to unaligned write, we will retry to allocate new block address for current IO, then it will initialize fio.list again, if fio was 
in the list before, it can break FIFO list, result in above panic. Thread A Thread B - do_write_page - allocate_data_block - list_add_tail : fioA cached in FIFO list. - do_write_page - allocate_data_block - list_add_tail : fioB cached in FIFO list. - f2fs_submit_page_write : fail to submit IO - allocate_data_block - INIT_LIST_HEAD - f2fs_submit_page_write - list_del <-- NULL pointer dereference This patch adds fio.retry parameter to indicate failure status for each IO, and avoid bailing out if there is still pending IO in FIFO list for fixing. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 ++++++-------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/gc.c | 5 +++-- fs/f2fs/segment.c | 11 ++++++----- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 65e7669155e8..0b7806cb4e7b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -460,13 +460,12 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -int f2fs_submit_page_write(struct f2fs_io_info *fio) +void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; - int err = 0; f2fs_bug_on(sbi, is_read_io(fio->op)); @@ -476,7 +475,7 @@ next: spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { spin_unlock(&io->io_lock); - goto out_fail; + goto out; } fio = list_first_entry(&io->io_list, struct f2fs_io_info, list); @@ -503,9 +502,9 @@ alloc_new: if (io->bio == NULL) { if ((fio->type == DATA || fio->type == NODE) && fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { - err = -EAGAIN; dec_page_count(sbi, WB_DATA_TYPE(bio_page)); - goto out_fail; + fio->retry = true; + goto skip; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, BIO_MAX_PAGES, false, @@ -525,12 +524,11 @@ alloc_new: f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); - +skip: if (fio->in_list) 
goto next; -out_fail: +out: up_write(&io->io_rwsem); - return err; } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ccdcf5865fb9..f702aeaf6f38 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1076,6 +1076,7 @@ struct f2fs_io_info { int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ bool is_meta; /* indicate borrow meta inode mapping or not */ + bool retry; /* need to reallocate block address */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ }; @@ -3013,7 +3014,7 @@ void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); -int f2fs_submit_page_write(struct f2fs_io_info *fio); +void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2b81537387c9..45713a64612d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -603,6 +603,7 @@ static void move_data_block(struct inode *inode, block_t bidx, .op_flags = REQ_SYNC, .encrypted_page = NULL, .in_list = false, + .retry = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -697,8 +698,8 @@ static void move_data_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - err = f2fs_submit_page_write(&fio); - if (err) { + f2fs_submit_page_write(&fio); + if (fio.retry) { if (PageWriteback(fio.encrypted_page)) end_page_writeback(fio.encrypted_page); goto put_page_out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4557704a852e..507f697178b6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2812,6 +2812,7 
@@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, INIT_LIST_HEAD(&fio->list); fio->in_list = true; + fio->retry = false; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); @@ -2847,7 +2848,6 @@ static void update_device_state(struct f2fs_io_info *fio) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); - int err; bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); if (keep_order) @@ -2857,13 +2857,14 @@ reallocate: &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_write(fio); - if (err == -EAGAIN) { + f2fs_submit_page_write(fio); + if (fio->retry) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; - } else if (!err) { - update_device_state(fio); } + + update_device_state(fio); + if (keep_order) up_read(&fio->sbi->io_order_lock); } From cc8093af7c420333d412ec5ef748900a53c433df Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 May 2018 23:47:19 +0800 Subject: [PATCH 764/804] f2fs: fix to avoid accessing cross the boundary Configure io_bits with 2 and enable LFS mode, generic/017 reports below dmesg: BUG: unable to handle kernel NULL pointer dereference at 00000039 *pdpt = 000000002fcb2001 *pde = 0000000000000000 Oops: 0000 [#1] PREEMPT SMP Modules linked in: crc32_generic zram f2fs(O) bnep rfcomm bluetooth ecdh_generic snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi pcbc snd_seq joydev aesni_intel aes_i586 snd_seq_device snd_timer crypto_simd cryptd snd soundcore i2c_piix4 serio_raw mac_hid video parport_pc ppdev lp parport hid_generic usbhid psmouse hid e1000 CPU: 2 PID: 20779 Comm: xfs_io Tainted: G O 4.17.0-rc2 #38 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 EIP: is_checkpointed_data+0x84/0xd0 [f2fs] EFLAGS: 00010207 CPU: 2 EAX: 00000000 EBX: f5cd7000 ECX: fffffe32 
EDX: 00000039 ESI: 000001cd EDI: ec95fb6c EBP: e264bd80 ESP: e264bd6c DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 80050033 CR2: 00000039 CR3: 2fe55660 CR4: 000406f0 Call Trace: __exchange_data_block+0xb3f/0x1000 [f2fs] f2fs_fallocate+0xab9/0x16b0 [f2fs] vfs_fallocate+0x17c/0x2d0 ksys_fallocate+0x42/0x70 sys_fallocate+0x31/0x40 do_fast_syscall_32+0xaa/0x22c entry_SYSENTER_32+0x4c/0x7b EIP: 0xb7f98c51 EFLAGS: 00000293 CPU: 2 EAX: ffffffda EBX: 00000003 ECX: 00000008 EDX: 01001000 ESI: 00000000 EDI: 00001000 EBP: 00000000 ESP: bfc0357c DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b Code: 00 00 d3 e8 8b 4d ec 2b 02 8b 55 f0 6b c0 1c 03 41 70 29 d6 8b 93 d0 06 00 00 8b 40 0c 83 ea 01 21 d6 89 f2 89 f1 c1 ea 03 f7 d1 <0f> be 14 10 83 e1 07 b8 01 00 00 00 d3 e0 85 c2 89 f8 0f 95 c3 EIP: is_checkpointed_data+0x84/0xd0 [f2fs] SS:ESP: 0068:e264bd6c CR2: 0000000000000039 ---[ end trace 9a4d4087cce6080a ]--- This is because in the recovery flow of __exchange_data_block, we didn't pass olen to __roll_back_blkaddrs; instead we passed len, which indicates the wrong array size, resulting in a random block address being copied into the dnode page. Later, once that random block address is accessed by is_checkpointed_data, it can cause a NULL pointer dereference.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4b7629f2422e..0c2af49be162 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1151,7 +1151,7 @@ static int __exchange_data_block(struct inode *src_inode, return 0; roll_back: - __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, olen); kvfree(src_blkaddr); kvfree(do_replace); return ret; From 5d1633aa1071aa481442434a540c084a10efd810 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 May 2018 00:20:39 +0800 Subject: [PATCH 765/804] f2fs: make __f2fs_write_data_pages() static Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0b7806cb4e7b..bf55ee0dfccd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2069,7 +2069,7 @@ continue_unlock: return ret; } -int __f2fs_write_data_pages(struct address_space *mapping, +static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f702aeaf6f38..7ce0272733e2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3039,9 +3039,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); -int __f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc, - enum iostat_type io_type); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); From fcf37e16f3cb91bad01a7ca1df5ebd032de614f3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: 
Wed, 30 May 2018 00:20:40 +0800 Subject: [PATCH 766/804] f2fs: make set_de_type() static Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2887bcef118b..f8e7bafd092a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -60,7 +60,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, }; -void set_de_type(struct f2fs_dir_entry *de, umode_t mode) +static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) { de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7ce0272733e2..c80ee4b1fa51 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2798,7 +2798,6 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *de, umode_t mode); unsigned char get_de_type(struct f2fs_dir_entry *de); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, From c35da89531b3cf7939498e4e1f39bf9338ebc10f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 May 2018 00:20:41 +0800 Subject: [PATCH 767/804] f2fs: clean up symbol namespace As Ted reported: "Hi, I was looking at f2fs's sources recently, and I noticed that there is a very large number of non-static symbols which don't have a f2fs prefix. There's well over a hundred (see attached below). As one example, in fs/f2fs/dir.c there is: unsigned char get_de_type(struct f2fs_dir_entry *de) This function is clearly only useful for f2fs, but it has a generic name. This means that if any other file system tries to have the same symbol name, there will be a symbol conflict and the kernel would not successfully build. 
It also means that when someone is looking f2fs sources, it's not at all obvious whether a function such as read_data_page(), invalidate_blocks(), is a generic kernel function found in the fs, mm, or block layers, or a f2fs specific function. You might want to fix this at some point. Hopefully Kent's bcachefs isn't similarly using genericly named functions, since that might cause conflicts with f2fs's functions --- but just as this would be a problem that we would rightly insist that Kent fix, this is something that we should have rightly insisted that f2fs should have fixed before it was integrated into the mainline kernel. acquire_orphan_inode add_ino_entry add_orphan_inode allocate_data_block allocate_new_segments alloc_nid alloc_nid_done alloc_nid_failed available_free_memory ...." This patch adds "f2fs_" prefix for all non-static symbols in order to: a) avoid conflict with other kernel generic symbols; b) to indicate the function is f2fs specific one instead of generic one; Reported-by: Theodore Ts'o Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 134 +++++++++--------- fs/f2fs/data.c | 115 ++++++++-------- fs/f2fs/dir.c | 68 ++++----- fs/f2fs/extent_cache.c | 22 +-- fs/f2fs/f2fs.h | 305 +++++++++++++++++++++-------------------- fs/f2fs/file.c | 136 +++++++++--------- fs/f2fs/gc.c | 64 ++++----- fs/f2fs/inline.c | 69 +++++----- fs/f2fs/inode.c | 52 +++---- fs/f2fs/namei.c | 55 ++++---- fs/f2fs/node.c | 223 +++++++++++++++--------------- fs/f2fs/recovery.c | 63 ++++----- fs/f2fs/segment.c | 185 +++++++++++++------------ fs/f2fs/shrinker.c | 4 +- fs/f2fs/super.c | 94 ++++++------- fs/f2fs/sysfs.c | 8 +- fs/f2fs/xattr.c | 26 ++-- 17 files changed, 826 insertions(+), 797 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0159a84ba02d..b00c807c8c8b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -24,7 +24,7 @@ #include static struct kmem_cache *ino_entry_slab; -struct kmem_cache 
*inode_entry_slab; +struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { @@ -36,7 +36,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) /* * We guarantee no failure on the returned page. */ -struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; @@ -108,18 +108,19 @@ out: return page; } -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, true); } /* for POR only */ -struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, false); } -bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) +bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) { switch (type) { case META_NAT: @@ -153,7 +154,7 @@ bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { struct page *page; @@ -176,7 +177,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if (!is_valid_meta_blkaddr(sbi, blkno, type)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkno, type)) goto out; switch (type) { @@ -220,7 +221,7 @@ out: return blkno - start; } -void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; bool readahead = false; @@ -231,7 +232,7 @@ void 
ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, @@ -252,7 +253,7 @@ static int __f2fs_write_meta_page(struct page *page, if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; - write_meta_page(sbi, page, io_type); + f2fs_do_write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -297,7 +298,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); + written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -308,7 +309,7 @@ skip_write: return 0; } -long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); @@ -458,20 +459,20 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, 0, type); } -void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); } /* mode should be APPEND_INO or UPDATE_INO */ -bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +bool 
f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) { struct inode_management *im = &sbi->im[mode]; struct ino_entry *e; @@ -482,7 +483,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_ino_entry(struct f2fs_sb_info *sbi, bool all) +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; @@ -501,13 +502,13 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } -void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { __add_ino_entry(sbi, ino, devidx, type); } -bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; @@ -522,7 +523,7 @@ bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, return is_dirty; } -int acquire_orphan_inode(struct f2fs_sb_info *sbi) +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; int err = 0; @@ -545,7 +546,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) return err; } -void release_orphan_inode(struct f2fs_sb_info *sbi) +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -555,14 +556,14 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct inode *inode) +void f2fs_add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); - update_inode_page(inode); + f2fs_update_inode_page(inode); } -void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { /* remove orphan entry from orphan list */ __remove_ino_entry(sbi, 
ino, ORPHAN_INO); @@ -572,7 +573,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct node_info ni; - int err = acquire_orphan_inode(sbi); + int err = f2fs_acquire_orphan_inode(sbi); if (err) goto err_out; @@ -600,7 +601,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - get_node_info(sbi, ino, &ni); + f2fs_get_node_info(sbi, ino, &ni); /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { @@ -618,7 +619,7 @@ err_out: return err; } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; @@ -646,10 +647,10 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); + f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { - struct page *page = get_meta_page(sbi, start_blk + i); + struct page *page = f2fs_get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; orphan_blk = (struct f2fs_orphan_block *)page_address(page); @@ -699,7 +700,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { if (!page) { - page = grab_meta_page(sbi, start_blk++); + page = f2fs_grab_meta_page(sbi, start_blk++); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); @@ -741,7 +742,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, size_t crc_offset = 0; __u32 crc = 0; - *cp_page = get_meta_page(sbi, cp_addr); + *cp_page = 
f2fs_get_meta_page(sbi, cp_addr); *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); @@ -794,7 +795,7 @@ invalid_cp1: return NULL; } -int get_valid_checkpoint(struct f2fs_sb_info *sbi) +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *cp_block; struct f2fs_super_block *fsb = sbi->raw_super; @@ -838,7 +839,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) memcpy(sbi->ckpt, cp_block, blk_size); /* Sanity checking of checkpoint */ - if (sanity_check_ckpt(sbi)) + if (f2fs_sanity_check_ckpt(sbi)) goto free_fail_no_cp; if (cur_page == cp1) @@ -857,7 +858,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) void *sit_bitmap_ptr; unsigned char *ckpt = (unsigned char *)sbi->ckpt; - cur_page = get_meta_page(sbi, cp_blk_no + i); + cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); sit_bitmap_ptr = page_address(cur_page); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); f2fs_put_page(cur_page, 1); @@ -902,7 +903,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type) stat_dec_dirty_inode(F2FS_I_SB(inode), type); } -void update_dirty_page(struct inode *inode, struct page *page) +void f2fs_update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; @@ -921,7 +922,7 @@ void update_dirty_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); } -void remove_dirty_inode(struct inode *inode) +void f2fs_remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? 
DIR_INODE : FILE_INODE; @@ -938,7 +939,7 @@ void remove_dirty_inode(struct inode *inode) spin_unlock(&sbi->inode_lock[type]); } -int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; struct inode *inode; @@ -1021,7 +1022,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) /* it's on eviction */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - update_inode_page(inode); + f2fs_update_inode_page(inode); iput(inode); } } @@ -1061,7 +1062,7 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - err = sync_dirty_inodes(sbi, DIR_INODE); + err = f2fs_sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; cond_resched(); @@ -1089,7 +1090,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1183,10 +1184,10 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, /* * pagevec_lookup_tag and lock_page again will take - * some extra time. Therefore, update_meta_pages and - * sync_meta_pages are combined in this function. + * some extra time. Therefore, f2fs_update_meta_pages and + * f2fs_sync_meta_pages are combined in this function. 
*/ - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); int err; memcpy(page_address(page), src, PAGE_SIZE); @@ -1224,7 +1225,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1253,7 +1254,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* 2 cp + n data seg summary + orphan inode blocks */ - data_sum_blocks = npages_for_summary_flush(sbi, false); + data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); @@ -1298,22 +1299,23 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) - update_meta_page(sbi, nm_i->nat_bits + + f2fs_update_meta_page(sbi, nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), blk + i); /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, + FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } } /* write out checkpoint buffer at block 0 */ - update_meta_page(sbi, ckpt, start_blk++); + f2fs_update_meta_page(sbi, ckpt, start_blk++); for (i = 1; i < 1 + cp_payload_blks; i++) - update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, start_blk++); if (orphan_num) { @@ -1321,7 +1323,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += orphan_blocks; } - write_data_summaries(sbi, start_blk); + 
f2fs_write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; /* Record write statistics in the hot node summary */ @@ -1332,7 +1334,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); if (__remain_node_summaries(cpc->reason)) { - write_node_summaries(sbi, start_blk); + f2fs_write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } @@ -1341,7 +1343,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ - sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1358,7 +1360,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) commit_checkpoint(sbi, ckpt, start_blk); wait_on_all_pages_writeback(sbi); - release_ino_entry(sbi, false); + f2fs_release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) return -EIO; @@ -1383,7 +1385,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* * We guarantee that this checkpoint procedure will not fail. 
*/ -int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; @@ -1416,7 +1418,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason & CP_DISCARD) { - if (!exist_trim_candidates(sbi, cpc)) { + if (!f2fs_exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; } @@ -1424,8 +1426,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); + f2fs_flush_sit_entries(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); unblock_operations(sbi); goto out; } @@ -1440,15 +1442,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi, cpc); - flush_sit_entries(sbi, cpc); + f2fs_flush_nat_entries(sbi, cpc); + f2fs_flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); if (err) - release_discard_addrs(sbi); + f2fs_release_discard_addrs(sbi); else - clear_prefree_segments(sbi, cpc); + f2fs_clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1465,7 +1467,7 @@ out: return err; } -void init_ino_entry_info(struct f2fs_sb_info *sbi) +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) { int i; @@ -1483,23 +1485,23 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) F2FS_ORPHANS_PER_BLOCK; } -int __init create_checkpoint_caches(void) +int __init f2fs_create_checkpoint_caches(void) { ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", sizeof(struct ino_entry)); if 
(!ino_entry_slab) return -ENOMEM; - inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", sizeof(struct inode_entry)); - if (!inode_entry_slab) { + if (!f2fs_inode_entry_slab) { kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; } -void destroy_checkpoint_caches(void) +void f2fs_destroy_checkpoint_caches(void) { kmem_cache_destroy(ino_entry_slab); - kmem_cache_destroy(inode_entry_slab); + kmem_cache_destroy(f2fs_inode_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bf55ee0dfccd..4b0db685e5d5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -245,7 +245,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, type, temp); } if (wbc) wbc_init_bio(wbc, bio); @@ -602,7 +602,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) * ->node_page * update block addresses in the node page */ -void set_data_blkaddr(struct dnode_of_data *dn) +void f2fs_set_data_blkaddr(struct dnode_of_data *dn) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true); __set_data_blkaddr(dn); @@ -613,12 +613,12 @@ void set_data_blkaddr(struct dnode_of_data *dn) void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); f2fs_update_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ -int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err; @@ -652,12 +652,12 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) } /* Should keep dn->ofs_in_node unchanged */ -int reserve_new_block(struct 
dnode_of_data *dn) +int f2fs_reserve_new_block(struct dnode_of_data *dn) { unsigned int ofs_in_node = dn->ofs_in_node; int ret; - ret = reserve_new_blocks(dn, 1); + ret = f2fs_reserve_new_blocks(dn, 1); dn->ofs_in_node = ofs_in_node; return ret; } @@ -667,12 +667,12 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) bool need_put = dn->inode_page ? false : true; int err; - err = get_dnode_of_data(dn, index, ALLOC_NODE); + err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; if (dn->data_blkaddr == NULL_ADDR) - err = reserve_new_block(dn); + err = f2fs_reserve_new_block(dn); if (err || need_put) f2fs_put_dnode(dn); return err; @@ -691,7 +691,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) return f2fs_reserve_block(dn, index); } -struct page *get_read_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; @@ -710,7 +710,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, } set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) goto put_err; f2fs_put_dnode(&dn); @@ -729,7 +729,8 @@ got_it: * A new dentry page is allocated but not able to be written, since its * new inode page couldn't be allocated due to -ENOSPC. * In such the case, its blkaddr can be remained as NEW_ADDR. - * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + * see, f2fs_add_link -> f2fs_get_new_data_page -> + * f2fs_init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); @@ -749,7 +750,7 @@ put_err: return ERR_PTR(err); } -struct page *find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -759,7 +760,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, REQ_SYNC, false); + page = f2fs_get_read_data_page(inode, index, REQ_SYNC, false); if (IS_ERR(page)) return page; @@ -779,13 +780,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, REQ_SYNC, for_write); + page = f2fs_get_read_data_page(inode, index, REQ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -811,7 +812,7 @@ repeat: * Note that, ipage is set only by make_empty_dir, and if any error occur, * ipage should be released by this function. 
*/ -struct page *get_new_data_page(struct inode *inode, +struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { struct address_space *mapping = inode->i_mapping; @@ -850,7 +851,7 @@ struct page *get_new_data_page(struct inode *inode, /* if ipage exists, blkaddr should be NEW_ADDR */ f2fs_bug_on(F2FS_I_SB(inode), ipage); - page = get_lock_data_page(inode, index, true); + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return page; } @@ -882,15 +883,15 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; alloc: - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL, false); - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) f2fs_i_size_write(dn->inode, @@ -928,7 +929,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { - map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); + map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); flag = f2fs_force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; @@ -1018,7 +1019,7 @@ next_dnode: /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); + err = f2fs_get_dnode_of_data(&dn, pgofs, mode); if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; @@ -1026,10 +1027,10 @@ next_dnode: err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = - get_next_page_offset(&dn, pgofs); + f2fs_get_next_page_offset(&dn, pgofs); if (map->m_next_extent) *map->m_next_extent = - get_next_page_offset(&dn, pgofs); + f2fs_get_next_page_offset(&dn, pgofs); } goto unlock_out; } @@ -1115,7 +1116,7 @@ skip: (pgofs == end || dn.ofs_in_node == end_offset)) { dn.ofs_in_node = ofs_in_node; - err = reserve_new_blocks(&dn, prealloc); + err = f2fs_reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; @@ -1234,7 +1235,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, { return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DEFAULT, NULL, - rw_hint_to_seg_type( + f2fs_rw_hint_to_seg_type( inode->i_write_hint)); } @@ -1279,7 +1280,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - get_node_info(sbi, inode->i_ino, &ni); + f2fs_get_node_info(sbi, inode->i_ino, &ni); phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + @@ -1306,7 +1307,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - get_node_info(sbi, xnid, &ni); + f2fs_get_node_info(sbi, xnid, &ni); phys = (__u64)blk_to_logical(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; @@ -1609,12 +1610,12 @@ static inline bool check_inplace_update_policy(struct inode *inode, if (policy & (0x1 << F2FS_IPU_FORCE)) return true; - if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi)) return true; if (policy & (0x1 << F2FS_IPU_UTIL) && utilization(sbi) > 
SM_I(sbi)->min_ipu_util) return true; - if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; @@ -1635,7 +1636,7 @@ static inline bool check_inplace_update_policy(struct inode *inode, return false; } -bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { if (f2fs_is_pinned_file(inode)) return true; @@ -1647,7 +1648,7 @@ bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) return check_inplace_update_policy(inode, fio); } -bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1670,13 +1671,13 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - if (should_update_outplace(inode, fio)) + if (f2fs_should_update_outplace(inode, fio)) return false; - return should_update_inplace(inode, fio); + return f2fs_should_update_inplace(inode, fio); } -int do_write_data_page(struct f2fs_io_info *fio) +int f2fs_do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; @@ -1701,7 +1702,7 @@ int do_write_data_page(struct f2fs_io_info *fio) if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) return -EAGAIN; - err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) goto out; @@ -1728,7 +1729,7 @@ got_it: f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); - err = rewrite_data_page(fio); + err = f2fs_inplace_write_data(fio); trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -1750,7 +1751,7 @@ got_it: 
ClearPageError(page); /* LFS mode write path */ - write_data_page(&dn, fio); + f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -1826,13 +1827,13 @@ write: /* we should not write 0'th page having journal header */ if (f2fs_is_volatile_file(inode) && (!page->index || (!wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)))) + f2fs_available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { fio.need_lock = LOCK_DONE; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); goto done; } @@ -1851,10 +1852,10 @@ write: } if (err == -EAGAIN) { - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { fio.need_lock = LOCK_REQ; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); } } @@ -1879,7 +1880,7 @@ out: if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); clear_inode_flag(inode, FI_HOT_DATA); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); submitted = NULL; } @@ -2092,7 +2093,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && - available_free_memory(sbi, DIRTY_DENTS)) + f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; /* skip writing during file defragment */ @@ -2118,7 +2119,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, * to detect pending bios. 
*/ - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); return ret; skip_write: @@ -2145,7 +2146,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) if (to > i_size) { down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -2177,7 +2178,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, } restart: /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -2187,7 +2188,7 @@ restart: if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { - read_inline_data(page, ipage); + f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_inline_node(ipage); @@ -2205,7 +2206,7 @@ restart: dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, @@ -2242,7 +2243,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); if (f2fs_is_atomic_file(inode) && - !available_free_memory(sbi, INMEM_PAGES)) { + !f2fs_available_free_memory(sbi, INMEM_PAGES)) { err = -ENOMEM; drop_atomic = true; goto fail; @@ -2326,7 +2327,7 @@ fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); if (drop_atomic) - drop_inmem_pages_all(sbi, false); + f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -2448,13 +2449,13 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, dec_page_count(sbi, F2FS_DIRTY_NODES); } else { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + 
f2fs_remove_dirty_inode(inode); } } /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return drop_inmem_page(inode, page); + return f2fs_drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -2487,7 +2488,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { - register_inmem_page(inode, page); + f2fs_register_inmem_page(inode, page); return 1; } /* @@ -2499,7 +2500,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - update_dirty_page(inode, page); + f2fs_update_dirty_page(inode, page); return 1; } return 0; @@ -2592,7 +2593,7 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; -void clear_radix_tree_dirty_tag(struct page *page) +void f2fs_clear_radix_tree_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f8e7bafd092a..a7feed756592 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -65,7 +65,7 @@ static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -unsigned char get_de_type(struct f2fs_dir_entry *de) +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) { if (de->file_type < F2FS_FT_MAX) return f2fs_filetype_table[de->file_type]; @@ -97,14 +97,14 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); - de = find_target_dentry(fname, namehash, max_slots, &d); + de = f2fs_find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; return de; } -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, +struct f2fs_dir_entry *f2fs_find_target_dentry(struct 
fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { @@ -171,7 +171,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; @@ -210,7 +210,7 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, if (f2fs_has_inline_dentry(dir)) { *res_page = NULL; - de = find_in_inline_dir(dir, fname, res_page); + de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } @@ -319,7 +319,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -void do_make_empty_dir(struct inode *inode, struct inode *parent, +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { struct qstr dot = QSTR_INIT(".", 1); @@ -340,23 +340,23 @@ static int make_empty_dir(struct inode *inode, struct f2fs_dentry_ptr d; if (f2fs_has_inline_dentry(inode)) - return make_empty_inline_dir(inode, parent, page); + return f2fs_make_empty_inline_dir(inode, parent, page); - dentry_page = get_new_data_page(inode, page, 0, true); + dentry_page = f2fs_get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); - do_make_empty_dir(inode, parent, &d); + f2fs_do_make_empty_dir(inode, parent, &d); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; } -struct page *init_inode_metadata(struct inode *inode, struct inode *dir, +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct page *dpage) { @@ -365,7 +365,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, int 
err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { - page = new_inode_page(inode); + page = f2fs_new_inode_page(inode); if (IS_ERR(page)) return page; @@ -395,7 +395,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; } } else { - page = get_node_page(F2FS_I_SB(dir), inode->i_ino); + page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } @@ -418,19 +418,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * we should remove this inode from orphan list. */ if (inode->i_nlink == 0) - remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); + f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); f2fs_i_links_write(inode, true); } return page; put_error: clear_nlink(inode); - update_inode(inode, page); + f2fs_update_inode(inode, page); f2fs_put_page(page, 1); return ERR_PTR(err); } -void update_parent_metadata(struct inode *dir, struct inode *inode, +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -448,7 +448,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_INC_LINK); } -int room_for_filename(const void *bitmap, int slots, int max_slots) +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) { int bit_start = 0; int zero_start, zero_end; @@ -537,12 +537,12 @@ start: (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - dentry_page = get_new_data_page(dir, NULL, block, true); + dentry_page = f2fs_get_new_data_page(dir, NULL, block, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; @@ -558,7 +558,7 @@ 
add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, + page = f2fs_init_inode_metadata(inode, dir, new_name, orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -576,7 +576,7 @@ add_dentry: f2fs_put_page(page, 1); } - update_parent_metadata(dir, inode, current_depth); + f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) up_write(&F2FS_I(inode)->i_sem); @@ -586,7 +586,7 @@ fail: return err; } -int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, +int f2fs_add_dentry(struct inode *dir, struct fscrypt_name *fname, struct inode *inode, nid_t ino, umode_t mode) { struct qstr new_name; @@ -610,7 +610,7 @@ int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; @@ -639,7 +639,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } fscrypt_free_filename(&fname); return err; @@ -651,7 +651,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL, NULL); + page = f2fs_init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -683,9 +683,9 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) - add_orphan_inode(inode); + f2fs_add_orphan_inode(inode); else - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); } /* @@ -703,7 +703,7 @@ void 
f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) - add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -729,13 +729,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && - !truncate_hole(dir, page->index, page->index + 1)) { - clear_radix_tree_dirty_tag(page); + !f2fs_truncate_hole(dir, page->index, page->index + 1)) { + f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); - remove_dirty_inode(dir); + f2fs_remove_dirty_inode(dir); } f2fs_put_page(page, 1); } @@ -752,7 +752,7 @@ bool f2fs_empty_dir(struct inode *dir) return f2fs_empty_inline_dir(dir); for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = get_lock_data_page(dir, bidx, false); + dentry_page = f2fs_get_lock_data_page(dir, bidx, false); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) continue; @@ -800,7 +800,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - d_type = get_de_type(de); + d_type = f2fs_get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -824,7 +824,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, return 1; if (sbi->readdir_ra == 1) - ra_node_page(sbi, le32_to_cpu(de->ino)); + f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; @@ -874,7 +874,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, 
(pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = get_lock_data_page(inode, n, false); + dentry_page = f2fs_get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index d5a861bf2b42..231b77ef5a53 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -49,7 +49,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, return NULL; } -struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -61,7 +61,7 @@ struct rb_entry *__lookup_rb_tree(struct rb_root *root, return re; } -struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root *root, struct rb_node **parent, unsigned int ofs) { @@ -92,7 +92,7 @@ struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simpfy the insertion after. * tree must stay unchanged between lookup and insertion. 
*/ -struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, @@ -159,7 +159,7 @@ lookup_neighbors: return re; } -bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root *root) { #ifdef CONFIG_F2FS_CHECK_FS @@ -390,7 +390,7 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = (struct extent_node *)__lookup_rb_tree(&et->root, + en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root, (struct rb_entry *)et->cached_en, pgofs); if (!en) goto out; @@ -470,7 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); + p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -520,7 +520,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. 
lookup first extent node in range [fofs, fofs + len - 1] */ - en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, (struct rb_entry *)et->cached_en, fofs, (struct rb_entry **)&prev_en, (struct rb_entry **)&next_en, @@ -773,7 +773,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) else blkaddr = dn->data_blkaddr; - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } @@ -788,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } -void init_extent_cache_info(struct f2fs_sb_info *sbi) +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); mutex_init(&sbi->extent_tree_lock); @@ -800,7 +800,7 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->total_ext_node, 0); } -int __init create_extent_cache(void) +int __init f2fs_create_extent_cache(void) { extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", sizeof(struct extent_tree)); @@ -815,7 +815,7 @@ int __init create_extent_cache(void) return 0; } -void destroy_extent_cache(void) +void f2fs_destroy_extent_cache(void) { kmem_cache_destroy(extent_node_slab); kmem_cache_destroy(extent_tree_slab); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c80ee4b1fa51..e91f7ff71dc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2710,7 +2710,7 @@ static inline int get_inline_xattr_addrs(struct inode *inode) return F2FS_I(inode)->i_inline_xattr_size; } -#define get_inode_mode(i) \ +#define f2fs_get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -2760,14 +2760,14 @@ static inline bool is_valid_blkaddr(block_t blkaddr) * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void truncate_data_blocks(struct dnode_of_data *dn); -int truncate_blocks(struct inode *inode, u64 from, bool lock); +void f2fs_truncate_data_blocks(struct dnode_of_data *dn); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int f2fs_setattr(struct dentry *dentry, struct iattr *attr); -int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); -void truncate_data_blocks_range(struct dnode_of_data *dn, int count); +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2781,37 +2781,37 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); -int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); -void update_inode(struct inode *inode, struct page *node_page); -void update_inode_page(struct inode *inode); +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_update_inode(struct inode *inode, struct page *node_page); +void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); -void handle_failed_inode(struct inode *inode); +void 
f2fs_handle_failed_inode(struct inode *inode); /* * namei.c */ -int update_extension_list(struct f2fs_sb_info *sbi, const char *name, +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -unsigned char get_de_type(struct f2fs_dir_entry *de); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, +unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *f2fs_find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d); int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); -void do_make_empty_dir(struct inode *inode, struct inode *parent, +void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d); -struct page *init_inode_metadata(struct inode *inode, struct inode *dir, +struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct page *dpage); -void update_parent_metadata(struct inode *dir, struct inode *inode, +void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth); -int room_for_filename(const void *bitmap, int slots, int max_slots); +int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); void f2fs_drop_nlink(struct inode *dir, struct inode *inode); struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, struct fscrypt_name *fname, struct page **res_page); @@ -2828,9 +2828,9 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode); -int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, +int f2fs_add_dentry(struct inode 
*dir, struct fscrypt_name *fname, struct inode *inode, nid_t ino, umode_t mode); -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode); @@ -2839,7 +2839,7 @@ bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { - return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name, + return f2fs_do_add_link(d_inode(dentry->d_parent), &dentry->d_name, inode, inode->i_ino, inode->i_mode); } @@ -2854,7 +2854,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); -int sanity_check_ckpt(struct f2fs_sb_info *sbi); +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c @@ -2868,139 +2868,146 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, struct dnode_of_data; struct node_info; -int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); -bool available_free_memory(struct f2fs_sb_info *sbi, int type); -int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); -bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); -void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); -pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); -int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode); -int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); -int remove_inode_page(struct inode *inode); -struct page *new_inode_page(struct inode *inode); 
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); -void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); -struct page *get_node_page_ra(struct page *parent, int start); -void move_node_page(struct page *node_page, int gc_type); -int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni); +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); +int f2fs_truncate_xattr_node(struct inode *inode); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_remove_inode_page(struct inode *inode); +struct page *f2fs_new_inode_page(struct inode *inode); +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *f2fs_get_node_page_ra(struct page *parent, int start); +void f2fs_move_node_page(struct page *node_page, int gc_type); +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); -void build_free_nids(struct f2fs_sb_info *sbi, 
bool sync, bool mount); -bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); -void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); -void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); -int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); -void recover_inline_xattr(struct inode *inode, struct page *page); -int recover_xattr_data(struct inode *inode, struct page *page); -int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -void restore_node_summary(struct f2fs_sb_info *sbi, +void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_xattr_data(struct inode *inode, struct page *page); +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int build_node_manager(struct f2fs_sb_info *sbi); -void destroy_node_manager(struct f2fs_sb_info *sbi); -int __init create_node_manager_caches(void); -void destroy_node_manager_caches(void); +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_node_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_node_manager_caches(void); +void f2fs_destroy_node_manager_caches(void); /* * segment.c */ -bool need_SSR(struct f2fs_sb_info *sbi); -void register_inmem_page(struct inode *inode, struct page *page); -void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void drop_inmem_pages(struct inode *inode); -void 
drop_inmem_page(struct inode *inode, struct page *page); -int commit_inmem_pages(struct inode *inode); +bool f2fs_need_SSR(struct f2fs_sb_info *sbi); +void f2fs_register_inmem_page(struct inode *inode, struct page *page); +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); +void f2fs_drop_inmem_pages(struct inode *inode); +void f2fs_drop_inmem_page(struct inode *inode, struct page *page); +int f2fs_commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); -int create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); -void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); -bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void drop_discard_cmd(struct f2fs_sb_info *sbi); -void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); -void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); -void release_discard_addrs(struct f2fs_sb_info *sbi); -int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); -void allocate_new_segments(struct f2fs_sb_info *sbi); +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void 
f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); -bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); -struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); -void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc); +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, + block_t blk_addr); +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type); -void write_node_page(unsigned int nid, struct f2fs_io_info *fio); -void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -int rewrite_data_page(struct f2fs_io_info *fio); -void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio); +int f2fs_inplace_write_data(struct f2fs_io_info *fio); +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); -void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); void 
f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); -void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); -int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); -void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int build_segment_manager(struct f2fs_sb_info *sbi); -void destroy_segment_manager(struct f2fs_sb_info *sbi); -int __init create_segment_manager_caches(void); -void destroy_segment_manager_caches(void); -int rw_hint_to_seg_type(enum rw_hint hint); -enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, - enum temp_type temp); +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); +int __init f2fs_create_segment_manager_caches(void); +void f2fs_destroy_segment_manager_caches(void); +int f2fs_rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp); /* * checkpoint.c */ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); -struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, +struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page 
*f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); -void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); -long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, +void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type); -void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); -void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); -void release_ino_entry(struct f2fs_sb_info *sbi, bool all); -bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); -void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); -bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, +bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); -int acquire_orphan_inode(struct f2fs_sb_info *sbi); -void release_orphan_inode(struct f2fs_sb_info *sbi); -void add_orphan_inode(struct inode *inode); -void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); -int recover_orphan_inodes(struct f2fs_sb_info *sbi); -int get_valid_checkpoint(struct f2fs_sb_info *sbi); -void update_dirty_page(struct inode *inode, struct page *page); -void 
remove_dirty_inode(struct inode *inode); -int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); -int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); -void init_ino_entry_info(struct f2fs_sb_info *sbi); -int __init create_checkpoint_caches(void); -void destroy_checkpoint_caches(void); +int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); +void f2fs_add_orphan_inode(struct inode *inode); +void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi); +int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); +void f2fs_update_dirty_page(struct inode *inode, struct page *page); +void f2fs_remove_dirty_inode(struct inode *inode); +int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_checkpoint_caches(void); +void f2fs_destroy_checkpoint_caches(void); /* * data.c @@ -3017,27 +3024,27 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio); struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio); int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); -void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_set_data_blkaddr(struct dnode_of_data *dn); void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); -int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); -int reserve_new_block(struct dnode_of_data *dn); +int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -struct 
page *get_read_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); -struct page *find_data_page(struct inode *inode, pgoff_t index); -struct page *get_lock_data_page(struct inode *inode, pgoff_t index, +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); +struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); -struct page *get_new_data_page(struct inode *inode, +struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); -int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_do_write_data_page(struct f2fs_io_info *fio); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -bool should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); -bool should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); +bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3046,23 +3053,23 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); -void clear_radix_tree_dirty_tag(struct page *page); +void f2fs_clear_radix_tree_dirty_tag(struct page *page); /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *sbi); -void stop_gc_thread(struct f2fs_sb_info *sbi); -block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); 
+block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno); -void build_gc_manager(struct f2fs_sb_info *sbi); +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); -bool space_for_roll_forward(struct f2fs_sb_info *sbi); +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -3267,29 +3274,31 @@ extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; -extern struct kmem_cache *inode_entry_slab; +extern struct kmem_cache *f2fs_inode_entry_slab; /* * inline.c */ bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); -void read_inline_data(struct page *page, struct page *ipage); -void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +void f2fs_do_read_inline_data(struct page *page, struct page *ipage); +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_write_inline_data(struct inode *inode, struct page *page); -bool recover_inline_data(struct inode *inode, struct page *npage); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, +bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page); -int make_empty_inline_dir(struct inode 
*inode, struct inode *parent, +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage); int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, - struct inode *dir, struct inode *inode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct page *page, struct inode *dir, + struct inode *inode); bool f2fs_empty_inline_dir(struct inode *dir); int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr); @@ -3310,17 +3319,17 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -struct rb_entry *__lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs); -struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, +struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, struct rb_root *root, struct rb_node **parent, unsigned int ofs); -struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, bool force); -bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, +bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); @@ -3332,9 +3341,9 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, pgoff_t fofs, block_t blkaddr, unsigned 
int len); -void init_extent_cache_info(struct f2fs_sb_info *sbi); -int __init create_extent_cache(void); -void destroy_extent_cache(void); +void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); +int __init f2fs_create_extent_cache(void); +void f2fs_destroy_extent_cache(void); /* * sysfs.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0c2af49be162..f1476c93ded5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -160,17 +160,18 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) cp_reason = CP_WRONG_PINO; - else if (!space_for_roll_forward(sbi)) + else if (!f2fs_space_for_roll_forward(sbi)) cp_reason = CP_NO_SPC_ROLL; - else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + else if (!f2fs_is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) cp_reason = CP_FASTBOOT_MODE; else if (F2FS_OPTION(sbi).active_logs == 2) cp_reason = CP_SPEC_LOG_NUM; else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT && - need_dentry_mark(sbi, inode->i_ino) && - exist_written_data(sbi, F2FS_I(inode)->i_pino, TRANS_DIR_INO)) + f2fs_need_dentry_mark(sbi, inode->i_ino) && + f2fs_exist_written_data(sbi, F2FS_I(inode)->i_pino, + TRANS_DIR_INO)) cp_reason = CP_RECOVER_DIR; return cp_reason; @@ -181,7 +182,7 @@ static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) struct page *i = find_get_page(NODE_MAPPING(sbi), ino); bool ret = false; /* But we need to avoid that there are some inode updates */ - if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) + if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino)) ret = true; f2fs_put_page(i, 0); return ret; @@ -241,14 +242,14 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, * if there is no written data, don't waste time to write recovery info. 
*/ if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && - !exist_written_data(sbi, ino, APPEND_INO)) { + !f2fs_exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || - exist_written_data(sbi, ino, UPDATE_INO)) + f2fs_exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } @@ -275,7 +276,7 @@ go_write: goto out; } sync_nodes: - ret = fsync_node_pages(sbi, inode, &wbc, atomic); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); if (ret) goto out; @@ -285,7 +286,7 @@ sync_nodes: goto out; } - if (need_inode_block_update(sbi, ino)) { + if (f2fs_need_inode_block_update(sbi, ino)) { f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; @@ -300,21 +301,21 @@ sync_nodes: * given fsync mark. */ if (!atomic) { - ret = wait_on_node_pages_writeback(sbi, ino); + ret = f2fs_wait_on_node_pages_writeback(sbi, ino); if (ret) goto out; } /* once recovery info is written, don't need to tack this */ - remove_ino_entry(sbi, ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { - remove_ino_entry(sbi, ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - remove_ino_entry(sbi, ino, FLUSH_INO); + f2fs_remove_ino_entry(sbi, ino, FLUSH_INO); } f2fs_update_time(sbi, REQ_TIME); out: @@ -395,13 +396,13 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* 
direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = get_next_page_offset(&dn, pgofs); + pgofs = f2fs_get_next_page_offset(&dn, pgofs); continue; } else { goto found; @@ -489,7 +490,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return dquot_file_open(inode, filp); } -void truncate_data_blocks_range(struct dnode_of_data *dn, int count) +void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; @@ -509,8 +510,8 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) continue; dn->data_blkaddr = NULL_ADDR; - set_data_blkaddr(dn); - invalidate_blocks(sbi, blkaddr); + f2fs_set_data_blkaddr(dn); + f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; @@ -522,7 +523,7 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) * once we invalidate valid blkaddr in range [ofs, ofs + count], * we will invalidate all blkaddr in the whole range. 
*/ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), + fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); @@ -534,9 +535,9 @@ void truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->ofs_in_node, nr_free); } -void truncate_data_blocks(struct dnode_of_data *dn) +void f2fs_truncate_data_blocks(struct dnode_of_data *dn) { - truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); } static int truncate_partial_data_page(struct inode *inode, u64 from, @@ -558,7 +559,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } - page = get_lock_data_page(inode, index, true); + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: @@ -573,7 +574,7 @@ truncate_out: return 0; } -int truncate_blocks(struct inode *inode, u64 from, bool lock) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -592,21 +593,21 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(inode, ipage, from); + f2fs_truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); + err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; @@ -619,13 +620,13 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) f2fs_bug_on(sbi, count < 0); if (dn.ofs_in_node || 
IS_INODE(dn.node_page)) { - truncate_data_blocks_range(&dn, count); + f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } f2fs_put_dnode(&dn); free_next: - err = truncate_inode_blocks(inode, free_from); + err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) f2fs_unlock_op(sbi); @@ -664,7 +665,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = truncate_blocks(inode, i_size_read(inode), true); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -815,7 +816,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, get_inode_mode(inode)); + err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); @@ -857,7 +858,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - page = get_new_data_page(inode, NULL, index, false); + page = f2fs_get_new_data_page(inode, NULL, index, false); f2fs_unlock_op(sbi); if (IS_ERR(page)) @@ -870,7 +871,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, return 0; } -int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) +int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) { int err; @@ -879,10 +880,11 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start = get_next_page_offset(&dn, pg_start); + pg_start = f2fs_get_next_page_offset(&dn, + pg_start); continue; } return err; @@ -893,7 +895,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) 
f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); - truncate_data_blocks_range(&dn, count); + f2fs_truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); pg_start += count; @@ -949,7 +951,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end - 1); f2fs_lock_op(sbi); - ret = truncate_hole(inode, pg_start, pg_end); + ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); up_write(&F2FS_I(inode)->i_mmap_sem); } @@ -967,7 +969,7 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { @@ -984,7 +986,7 @@ next_dnode: for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (!is_checkpointed_data(sbi, *blkaddr)) { + if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { f2fs_put_dnode(&dn); @@ -1017,10 +1019,10 @@ static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, continue; set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + ret = f2fs_get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); if (ret) { dec_valid_block_count(sbi, inode, 1); - invalidate_blocks(sbi, *blkaddr); + f2fs_invalidate_blocks(sbi, *blkaddr); } else { f2fs_update_data_blkaddr(&dn, *blkaddr); } @@ -1050,18 +1052,18 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, pgoff_t ilen; set_new_dnode(&dn, dst_inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + ret = f2fs_get_dnode_of_data(&dn, dst + i, ALLOC_NODE); if (ret) return ret; - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); ilen = min((pgoff_t) ADDRS_PER_PAGE(dn.node_page, dst_inode) - 
dn.ofs_in_node, len - i); do { dn.data_blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, @@ -1084,10 +1086,11 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } else { struct page *psrc, *pdst; - psrc = get_lock_data_page(src_inode, src + i, true); + psrc = f2fs_get_lock_data_page(src_inode, + src + i, true); if (IS_ERR(psrc)) return PTR_ERR(psrc); - pdst = get_new_data_page(dst_inode, NULL, dst + i, + pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i, true); if (IS_ERR(pdst)) { f2fs_put_page(psrc, 1); @@ -1098,7 +1101,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - ret = truncate_hole(src_inode, src + i, src + i + 1); + ret = f2fs_truncate_hole(src_inode, + src + i, src + i + 1); if (ret) return ret; i++; @@ -1215,7 +1219,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = truncate_blocks(inode, new_size, true); + ret = f2fs_truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); out_unlock: @@ -1240,7 +1244,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, } dn->ofs_in_node = ofs_in_node; - ret = reserve_new_blocks(dn, count); + ret = f2fs_reserve_new_blocks(dn, count); if (ret) return ret; @@ -1249,7 +1253,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); /* - * reserve_new_blocks will not guarantee entire block + * f2fs_reserve_new_blocks will not guarantee entire block * allocation. 
*/ if (dn->data_blkaddr == NULL_ADDR) { @@ -1257,9 +1261,9 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, break; } if (dn->data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn->data_blkaddr); + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); + f2fs_set_data_blkaddr(dn); } } @@ -1325,7 +1329,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); + ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; @@ -1399,7 +1403,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = truncate_blocks(inode, i_size_read(inode), true); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (ret) goto out; @@ -1560,7 +1564,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); @@ -1583,7 +1587,7 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) */ if (f2fs_is_atomic_file(inode) && F2FS_I(inode)->inmem_task == current) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); return 0; } @@ -1732,7 +1736,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } if (f2fs_is_atomic_file(inode)) { - ret = commit_inmem_pages(inode); + ret = f2fs_commit_inmem_pages(inode); if (ret) goto err_out; @@ -1835,7 +1839,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) inode_lock(inode); if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); if 
(f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); @@ -1892,7 +1896,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -1900,10 +1904,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) goto out; } - stop_gc_thread(sbi); - stop_discard_thread(sbi); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); - drop_discard_cmd(sbi); + f2fs_drop_discard_cmd(sbi); clear_opt(sbi, DISCARD); f2fs_update_time(sbi, REQ_TIME); @@ -2096,7 +2100,7 @@ out: return ret; } -static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_f2fs_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2134,7 +2138,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, int err; /* if in-place-update policy is enabled, don't waste time here */ - if (should_update_inplace(inode, NULL)) + if (f2fs_should_update_inplace(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -2229,7 +2233,7 @@ do_map: while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { struct page *page; - page = get_lock_data_page(inode, idx, true); + page = f2fs_get_lock_data_page(inode, idx, true); if (IS_ERR(page)) { err = PTR_ERR(page); goto clear_out; @@ -2576,7 +2580,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (should_update_outplace(inode, NULL)) { + if (f2fs_should_update_outplace(inode, NULL)) { ret = -EINVAL; goto out; } @@ -2689,7 +2693,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return 
f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_write_checkpoint(filp, arg); + return f2fs_ioc_f2fs_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 45713a64612d..dcadc0691a3e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -114,7 +114,7 @@ next: return 0; } -int start_gc_thread(struct f2fs_sb_info *sbi) +int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -146,7 +146,7 @@ out: return err; } -void stop_gc_thread(struct f2fs_sb_info *sbi) +void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; if (!gc_th) @@ -429,7 +429,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) iput(inode); return; } - new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); new_ie->inode = inode; f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); @@ -443,7 +443,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list) radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); list_del(&ie->list); - kmem_cache_free(inode_entry_slab, ie); + kmem_cache_free(f2fs_inode_entry_slab, ie); } } @@ -492,34 +492,34 @@ next_step: continue; if (phase == 0) { - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true); continue; } if (phase == 1) { - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); continue; } /* phase == 2 */ - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; - /* block may become invalid during get_node_page */ + /* block may become invalid during f2fs_get_node_page */ if (check_valid_map(sbi, segno, off) == 0) { f2fs_put_page(node_page, 1); continue; } - 
get_node_info(sbi, nid, &ni); + f2fs_get_node_info(sbi, nid, &ni); if (ni.blk_addr != start_addr + off) { f2fs_put_page(node_page, 1); continue; } - move_node_page(node_page, gc_type); + f2fs_move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } @@ -534,7 +534,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) +block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -565,11 +565,11 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) return false; - get_node_info(sbi, nid, dni); + f2fs_get_node_info(sbi, nid, dni); if (sum->version != dni->version) { f2fs_msg(sbi->sb, KERN_WARNING, @@ -633,7 +633,7 @@ static void move_data_block(struct inode *inode, block_t bidx, } set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) goto out; @@ -648,7 +648,7 @@ static void move_data_block(struct inode *inode, block_t bidx, */ f2fs_wait_on_page_writeback(page, DATA, true); - get_node_info(fio.sbi, dn.nid, &ni); + f2fs_get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ @@ -658,7 +658,7 @@ static void move_data_block(struct inode *inode, block_t bidx, if (lfs_mode) down_write(&fio.sbi->io_order_lock); - allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), @@ -717,7 +717,7 @@ recover_block: if 
(lfs_mode) up_write(&fio.sbi->io_order_lock); if (err) - __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, true, true); put_out: f2fs_put_dnode(&dn); @@ -730,7 +730,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, { struct page *page; - page = get_lock_data_page(inode, bidx, true); + page = f2fs_get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return; @@ -775,12 +775,12 @@ retry: f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); } set_cold_data(page); - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err) { clear_cold_data(page); if (err == -ENOMEM) { @@ -832,13 +832,13 @@ next_step: continue; if (phase == 0) { - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, META_NAT, true); continue; } if (phase == 1) { - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); continue; } @@ -847,7 +847,7 @@ next_step: continue; if (phase == 2) { - ra_node_page(sbi, dni.ino); + f2fs_ra_node_page(sbi, dni.ino); continue; } @@ -870,8 +870,8 @@ next_step: continue; } - start_bidx = start_bidx_of_node(nofs, inode); - data_page = get_read_data_page(inode, + start_bidx = f2fs_start_bidx_of_node(nofs, inode); + data_page = f2fs_get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -905,7 +905,7 @@ next_step: inode_dio_wait(inode); } - start_bidx = start_bidx_of_node(nofs, inode) + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_post_read_required(inode)) move_data_block(inode, start_bidx, gc_type, @@ -955,12 +955,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, 
GET_SUM_BLOCK(sbi, segno), + f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, META_SSA, true); /* reference all summary page */ while (segno < end_segno) { - sum_page = get_sum_page(sbi, segno++); + sum_page = f2fs_get_sum_page(sbi, segno++); unlock_page(sum_page); } @@ -1056,7 +1056,7 @@ gc_more: * secure free segments which doesn't need fggc any more. */ if (prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); + ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; } @@ -1093,13 +1093,13 @@ gc_more: if (has_not_enough_free_secs(sbi, sec_freed, 0)) { if (skipped_round > MAX_SKIP_ATOMIC_COUNT && skipped_round * 2 >= round) - drop_inmem_pages_all(sbi, true); + f2fs_drop_inmem_pages_all(sbi, true); segno = NULL_SEGNO; goto gc_more; } if (gc_type == FG_GC) - ret = write_checkpoint(sbi, &cpc); + ret = f2fs_write_checkpoint(sbi, &cpc); } stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; @@ -1123,7 +1123,7 @@ stop: return ret; } -void build_gc_manager(struct f2fs_sb_info *sbi) +void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { DIRTY_I(sbi)->v_ops = &default_v_ops; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b51cc241f354..f3185ae98860 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -42,7 +42,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) return true; } -void read_inline_data(struct page *page, struct page *ipage) +void f2fs_do_read_inline_data(struct page *page, struct page *ipage) { struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; @@ -64,7 +64,8 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) +void f2fs_truncate_inline_inode(struct inode *inode, + struct page *ipage, u64 from) { void *addr; @@ -85,7 +86,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) { struct page *ipage; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = 
f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { unlock_page(page); return PTR_ERR(ipage); @@ -99,7 +100,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) if (page->index) zero_user_segment(page, 0, PAGE_SIZE); else - read_inline_data(page, ipage); + f2fs_do_read_inline_data(page, ipage); if (!PageUptodate(page)) SetPageUptodate(page); @@ -131,7 +132,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - read_inline_data(page, dn->inode_page); + f2fs_do_read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -142,18 +143,18 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) ClearPageError(page); fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); - write_data_page(dn, &fio); + f2fs_outplace_write_data(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { inode_dec_dirty_pages(dn->inode); - remove_dirty_inode(dn->inode); + f2fs_remove_dirty_inode(dn->inode); } /* this converted inline_data should be recovered. 
*/ set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode, dn->inode_page, 0); + f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); @@ -178,7 +179,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -207,7 +208,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) int err; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); if (err) return err; @@ -225,7 +226,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - clear_radix_tree_dirty_tag(page); + f2fs_clear_radix_tree_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -235,7 +236,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) return 0; } -bool recover_inline_data(struct inode *inode, struct page *npage) +bool f2fs_recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; @@ -256,7 +257,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -274,20 +275,20 @@ process_inline: } if (f2fs_has_inline_data(inode)) { - ipage = get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_inode(inode, ipage, 
0); + f2fs_truncate_inline_inode(inode, ipage, 0); clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (truncate_blocks(inode, 0, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } return false; } -struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, +struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); @@ -298,7 +299,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, void *inline_dentry; f2fs_hash_t namehash; - ipage = get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { *res_page = ipage; return NULL; @@ -309,7 +310,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, inline_dentry = inline_data_addr(dir, ipage); make_dentry_ptr_inline(dir, &d, inline_dentry); - de = find_target_dentry(fname, namehash, NULL, &d); + de = f2fs_find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) *res_page = ipage; @@ -319,7 +320,7 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return de; } -int make_empty_inline_dir(struct inode *inode, struct inode *parent, +int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { struct f2fs_dentry_ptr d; @@ -328,7 +329,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); - do_make_empty_dir(inode, parent, &d); + f2fs_do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -386,7 +387,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ipage, 0); 
stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -429,7 +430,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); - fake_mode = get_de_type(de) << S_SHIFT; + fake_mode = f2fs_get_de_type(de) << S_SHIFT; err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, ino, fake_mode); @@ -441,8 +442,8 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - truncate_blocks(dir, 0, false); - remove_dirty_inode(dir); + f2fs_truncate_blocks(dir, 0, false); + f2fs_remove_dirty_inode(dir); return err; } @@ -460,7 +461,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); - truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -509,14 +510,14 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *page = NULL; int err = 0; - ipage = get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); inline_dentry = inline_data_addr(dir, ipage); make_dentry_ptr_inline(dir, &d, inline_dentry); - bit_pos = room_for_filename(d.bitmap, slots, d.max); + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) @@ -527,7 +528,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, new_name, + page = f2fs_init_inode_metadata(inode, dir, new_name, orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -548,7 +549,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_put_page(page, 1); } - update_parent_metadata(dir, 
inode, 0); + f2fs_update_parent_metadata(dir, inode, 0); fail: if (inode) up_write(&F2FS_I(inode)->i_sem); @@ -594,7 +595,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; @@ -625,7 +626,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -651,7 +652,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, struct page *ipage; int err = 0; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -667,7 +668,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2f8c99ab99f7..27e18b5cb459 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -209,10 +209,10 @@ static int do_read_inode(struct inode *inode) projid_t i_projid; /* Check if ino is within scope */ - if (check_nid_range(sbi, inode->i_ino)) + if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - node_page = get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -278,7 +278,7 @@ static int do_read_inode(struct inode *inode) if (__written_first_block(ri)) set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); - if (!need_inode_block_update(sbi, inode->i_ino)) + if (!f2fs_need_inode_block_update(sbi, 
inode->i_ino)) fi->last_disk_size = inode->i_size; if (fi->i_flags & F2FS_PROJINHERIT_FL) @@ -390,7 +390,7 @@ retry: return inode; } -void update_inode(struct inode *inode, struct page *node_page) +void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; @@ -476,12 +476,12 @@ void update_inode(struct inode *inode, struct page *node_page) F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; } -void update_inode_page(struct inode *inode) +void f2fs_update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; retry: - node_page = get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); if (err == -ENOMEM) { @@ -492,7 +492,7 @@ retry: } return; } - update_inode(inode, node_page); + f2fs_update_inode(inode, node_page); f2fs_put_page(node_page, 1); } @@ -511,7 +511,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. 
*/ - update_inode_page(inode); + f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; @@ -528,7 +528,7 @@ void f2fs_evict_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -538,7 +538,7 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); @@ -547,9 +547,9 @@ void f2fs_evict_inode(struct inode *inode) dquot_initialize(inode); - remove_ino_entry(sbi, inode->i_ino, APPEND_INO); - remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); - remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); @@ -566,7 +566,7 @@ retry: #endif if (!err) { f2fs_lock_op(sbi); - err = remove_inode_page(inode); + err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); if (err == -ENOENT) err = 0; @@ -579,7 +579,7 @@ retry: } if (err) - update_inode_page(inode); + f2fs_update_inode_page(inode); dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: @@ -602,18 +602,18 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); + f2fs_add_ino_entry(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + f2fs_add_ino_entry(sbi, inode->i_ino, UPDATE_INO); } if (is_inode_flag_set(inode, FI_FREE_NID)) { - alloc_nid_failed(sbi, inode->i_ino); + f2fs_alloc_nid_failed(sbi, inode->i_ino); 
clear_inode_flag(inode, FI_FREE_NID); } else { /* * If xattr nid is corrupted, we can reach out error condition, - * err & !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). - * In that case, check_nid_range() is enough to give a clue. + * err & !f2fs_exist_written_data(sbi, inode->i_ino, ORPHAN_INO)). + * In that case, f2fs_check_nid_range() is enough to give a clue. */ } out_clear: @@ -622,7 +622,7 @@ out_clear: } /* caller should call f2fs_lock_op() */ -void handle_failed_inode(struct inode *inode) +void f2fs_handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; @@ -637,7 +637,7 @@ void handle_failed_inode(struct inode *inode) * we must call this to avoid inode being remained as dirty, resulting * in a panic when flushing dirty inodes in gdirty_list. */ - update_inode_page(inode); + f2fs_update_inode_page(inode); f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ @@ -648,18 +648,18 @@ void handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. 
*/ - get_node_info(sbi, inode->i_ino, &ni); + f2fs_get_node_info(sbi, inode->i_ino, &ni); if (ni.blk_addr != NULL_ADDR) { - int err = acquire_orphan_inode(sbi); + int err = f2fs_acquire_orphan_inode(sbi); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, "Too many orphan inodes, run fsck to fix."); } else { - add_orphan_inode(inode); + f2fs_add_orphan_inode(inode); } - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); } else { set_inode_flag(inode, FI_FREE_NID); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b32433d8667b..7b025524ee16 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,7 +37,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) return ERR_PTR(-ENOMEM); f2fs_lock_op(sbi); - if (!alloc_nid(sbi, &ino)) { + if (!f2fs_alloc_nid(sbi, &ino)) { f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; @@ -196,7 +196,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * up_read(&sbi->sb_lock); } -int update_extension_list(struct f2fs_sb_info *sbi, const char *name, +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; @@ -295,7 +295,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto out; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, ino); + f2fs_alloc_nid_done(sbi, ino); unlock_new_inode(inode); d_instantiate(dentry, inode); @@ -306,7 +306,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, f2fs_balance_fs(sbi, true); return 0; out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -401,7 +401,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) err = PTR_ERR(page); goto out; } else { - err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + err = f2fs_do_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) goto 
out; } @@ -412,7 +412,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) else if (IS_ERR(page)) err = PTR_ERR(page); else - err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR); out: if (!err) clear_inode_flag(dir, FI_INLINE_DOTS); @@ -524,7 +524,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); f2fs_put_page(page, 0); @@ -586,9 +586,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) - goto out_handle_failed_inode; + goto out_f2fs_handle_failed_inode; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link); if (err) @@ -622,8 +622,8 @@ err_out: f2fs_balance_fs(sbi, true); goto out_free_encrypted_link; -out_handle_failed_inode: - handle_failed_inode(inode); +out_f2fs_handle_failed_inode: + f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); @@ -659,7 +659,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); unlock_new_inode(inode); d_instantiate(dentry, inode); @@ -672,7 +672,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: clear_inode_flag(inode, FI_INC_LINK); - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -711,7 +711,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, goto out; f2fs_unlock_op(sbi); - alloc_nid_done(sbi, inode->i_ino); + f2fs_alloc_nid_done(sbi, inode->i_ino); unlock_new_inode(inode); 
d_instantiate(dentry, inode); @@ -722,7 +722,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, f2fs_balance_fs(sbi, true); return 0; out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -751,7 +751,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, } f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) goto out; @@ -763,8 +763,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(inode); - alloc_nid_done(sbi, inode->i_ino); + f2fs_add_orphan_inode(inode); + f2fs_alloc_nid_done(sbi, inode->i_ino); if (whiteout) { f2fs_i_links_write(inode, false); @@ -780,9 +780,9 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, return 0; release_out: - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); out: - handle_failed_inode(inode); + f2fs_handle_failed_inode(inode); return err; } @@ -889,7 +889,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) goto put_out_dir; @@ -903,9 +903,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(new_inode)->i_sem); if (!new_inode->i_nlink) - add_orphan_inode(new_inode); + f2fs_add_orphan_inode(new_inode); else - release_orphan_inode(sbi); + f2fs_release_orphan_inode(sbi); } else { f2fs_balance_fs(sbi, true); @@ -974,9 +974,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, false); } if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); if (S_ISDIR(old_inode->i_mode)) - add_ino_entry(sbi, 
old_inode->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, old_inode->i_ino, + TRANS_DIR_INO); } f2fs_unlock_op(sbi); @@ -1128,8 +1129,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_mark_inode_dirty_sync(new_dir, false); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) { - add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); - add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, old_dir->i_ino, TRANS_DIR_INO); + f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO); } f2fs_unlock_op(sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2902e1fadebc..cd0f60b5be7a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -23,7 +23,7 @@ #include "trace.h" #include -#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) +#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -32,7 +32,7 @@ static struct kmem_cache *nat_entry_set_slab; /* * Check whether the given nid is within node id range. 
*/ -int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -44,7 +44,7 @@ int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -bool available_free_memory(struct f2fs_sb_info *sbi, int type) +bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct sysinfo val; @@ -103,7 +103,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { if (PageDirty(page)) { - clear_radix_tree_dirty_tag(page); + f2fs_clear_radix_tree_dirty_tag(page); clear_page_dirty_for_io(page); dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } @@ -113,7 +113,7 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { pgoff_t index = current_nat_addr(sbi, nid); - return get_meta_page(sbi, index); + return f2fs_get_meta_page(sbi, index); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) @@ -130,8 +130,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) dst_off = next_nat_addr(sbi, src_off); /* get current nat block page with lock */ - src_page = get_meta_page(sbi, src_off); - dst_page = grab_meta_page(sbi, dst_off); + src_page = f2fs_get_meta_page(sbi, src_off); + dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); src_addr = page_address(src_page); @@ -267,7 +267,7 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) +int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -284,7 +284,7 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) return need; } 
-bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -298,7 +298,7 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } -bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) +bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -397,7 +397,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, up_write(&nm_i->nat_tree_lock); } -int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; @@ -419,7 +419,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) /* * This function always returns success */ -void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) +void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, + struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -449,7 +450,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) /* Check current segment summary */ down_read(&curseg->journal_rwsem); - i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); + i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); @@ -464,7 +465,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) index = current_nat_addr(sbi, nid); up_read(&nm_i->nat_tree_lock); - page = get_meta_page(sbi, index); + page = f2fs_get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); @@ -477,7 +478,7 @@ cache: /* * readahead 
MAX_RA_NODE number of node pages. */ -static void ra_node_pages(struct page *parent, int start, int n) +static void f2fs_ra_node_pages(struct page *parent, int start, int n) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); struct blk_plug plug; @@ -491,13 +492,13 @@ static void ra_node_pages(struct page *parent, int start, int n) end = min(end, NIDS_PER_BLOCK); for (i = start; i < end; i++) { nid = get_nid(parent, i, false); - ra_node_page(sbi, nid); + f2fs_ra_node_page(sbi, nid); } blk_finish_plug(&plug); } -pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) { const long direct_index = ADDRS_PER_INODE(dn->inode); const long direct_blks = ADDRS_PER_BLOCK; @@ -612,7 +613,7 @@ got: * f2fs_unlock_op() only if ro is not set RDONLY_NODE. * In the case of RDONLY_NODE, we don't need to care about mutex. */ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) +int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *npage[4]; @@ -631,7 +632,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) npage[0] = dn->inode_page; if (!npage[0]) { - npage[0] = get_node_page(sbi, nids[0]); + npage[0] = f2fs_get_node_page(sbi, nids[0]); if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } @@ -655,24 +656,24 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ - if (!alloc_nid(sbi, &(nids[i]))) { + if (!f2fs_alloc_nid(sbi, &(nids[i]))) { err = -ENOSPC; goto release_pages; } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i]); + npage[i] = f2fs_new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { - alloc_nid_failed(sbi, nids[i]); + f2fs_alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); - 
alloc_nid_done(sbi, nids[i]); + f2fs_alloc_nid_done(sbi, nids[i]); done = true; } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { - npage[i] = get_node_page_ra(parent, offset[i - 1]); + npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); goto release_pages; @@ -687,7 +688,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (!done) { - npage[i] = get_node_page(sbi, nids[i]); + npage[i] = f2fs_get_node_page(sbi, nids[i]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); f2fs_put_page(npage[0], 0); @@ -726,15 +727,15 @@ static void truncate_node(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); /* Deallocate node address */ - invalidate_blocks(sbi, ni.blk_addr); + f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { - remove_orphan_inode(sbi, dn->nid); + f2fs_remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } @@ -759,7 +760,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) @@ -768,7 +769,7 @@ static int truncate_dnode(struct dnode_of_data *dn) /* Make dnode_of_data for parameter */ dn->node_page = page; dn->ofs_in_node = 0; - truncate_data_blocks(dn); + f2fs_truncate_data_blocks(dn); truncate_node(dn); return 1; } @@ -789,13 +790,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = 
f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); } - ra_node_pages(page, ofs, NIDS_PER_BLOCK); + f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK); rn = F2FS_NODE(page); if (depth < 3) { @@ -865,7 +866,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]); + pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]); if (IS_ERR(pages[i])) { err = PTR_ERR(pages[i]); idx = i - 1; @@ -874,7 +875,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } - ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { @@ -911,7 +912,7 @@ fail: /* * All the block addresses of data and nodes should be nullified. 
*/ -int truncate_inode_blocks(struct inode *inode, pgoff_t from) +int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err = 0, cont = 1; @@ -927,7 +928,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) if (level < 0) return level; - page = get_node_page(sbi, inode->i_ino); + page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); return PTR_ERR(page); @@ -1007,7 +1008,7 @@ fail: } /* caller must lock inode page */ -int truncate_xattr_node(struct inode *inode) +int f2fs_truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -1017,7 +1018,7 @@ int truncate_xattr_node(struct inode *inode) if (!nid) return 0; - npage = get_node_page(sbi, nid); + npage = f2fs_get_node_page(sbi, nid); if (IS_ERR(npage)) return PTR_ERR(npage); @@ -1032,17 +1033,17 @@ int truncate_xattr_node(struct inode *inode) * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). 
*/ -int remove_inode_page(struct inode *inode) +int f2fs_remove_inode_page(struct inode *inode) { struct dnode_of_data dn; int err; set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE); if (err) return err; - err = truncate_xattr_node(inode); + err = f2fs_truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; @@ -1051,7 +1052,7 @@ int remove_inode_page(struct inode *inode) /* remove potential inline_data blocks */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), @@ -1062,7 +1063,7 @@ int remove_inode_page(struct inode *inode) return 0; } -struct page *new_inode_page(struct inode *inode) +struct page *f2fs_new_inode_page(struct inode *inode) { struct dnode_of_data dn; @@ -1070,10 +1071,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0); + return f2fs_new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) +struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; @@ -1091,7 +1092,7 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - get_node_info(sbi, dn->nid, &new_ni); + f2fs_get_node_info(sbi, dn->nid, &new_ni); f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); #endif new_ni.nid = dn->nid; @@ -1143,7 +1144,7 @@ static int read_node_page(struct page *page, int op_flags) if (PageUptodate(page)) return LOCKED_PAGE; - get_node_info(sbi, page->index, &ni); + f2fs_get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) 
{ ClearPageUptodate(page); @@ -1157,14 +1158,14 @@ static int read_node_page(struct page *page, int op_flags) /* * Readahead a node page */ -void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { struct page *apage; int err; if (!nid) return; - if (check_nid_range(sbi, nid)) + if (f2fs_check_nid_range(sbi, nid)) return; rcu_read_lock(); @@ -1189,7 +1190,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (!nid) return ERR_PTR(-ENOENT); - if (check_nid_range(sbi, nid)) + if (f2fs_check_nid_range(sbi, nid)) return ERR_PTR(-EINVAL); repeat: page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); @@ -1206,7 +1207,7 @@ repeat: } if (parent) - ra_node_pages(parent, start + 1, MAX_RA_NODE); + f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); @@ -1240,12 +1241,12 @@ out_err: return page; } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { return __get_node_page(sbi, nid, NULL, 0); } -struct page *get_node_page_ra(struct page *parent, int start) +struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); @@ -1280,7 +1281,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: @@ -1384,7 +1385,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, down_read(&sbi->node_write); } - get_node_info(sbi, nid, &ni); + f2fs_get_node_info(sbi, nid, &ni); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1401,7 +1402,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, set_page_writeback(page); 
ClearPageError(page); fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); + f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); @@ -1430,7 +1431,7 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -void move_node_page(struct page *node_page, int gc_type) +void f2fs_move_node_page(struct page *node_page, int gc_type) { if (gc_type == FG_GC) { struct writeback_control wbc = { @@ -1467,7 +1468,7 @@ static int f2fs_write_node_page(struct page *page, return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } -int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, +int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index; @@ -1534,9 +1535,9 @@ continue_unlock: if (IS_INODE(page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - update_inode(inode, page); + f2fs_update_inode(inode, page); set_dentry_mark(page, - need_dentry_mark(sbi, ino)); + f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ if (!PageDirty(page)) @@ -1586,7 +1587,8 @@ out: return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, +int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, + struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { pgoff_t index; @@ -1687,7 +1689,7 @@ continue_unlock: return ret; } -int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index = 0; struct pagevec pvec; @@ -1744,7 +1746,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc, true, FS_NODE_IO); + f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1892,20 +1894,20 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, * Thread A Thread B * - f2fs_create * - f2fs_new_inode - * - alloc_nid + * - f2fs_alloc_nid * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg - * - build_free_nids - * - __build_free_nids + * - f2fs_build_free_nids + * - __f2fs_build_free_nids * - scan_nat_page * - add_free_nid * - __lookup_nat_cache * - f2fs_add_link - * - init_inode_metadata - * - new_inode_page - * - new_node_page + * - f2fs_init_inode_metadata + * - f2fs_new_inode_page + * - f2fs_new_node_page * - set_node_addr - * - alloc_nid_done + * - f2fs_alloc_nid_done * - __remove_nid_from_list(PREALLOC_NID) * - __insert_nid_to_list(FREE_NID) */ @@ -2037,7 +2039,8 @@ out: up_read(&nm_i->nat_tree_lock); } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, + bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); int i = 0; @@ -2050,7 +2053,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; - if 
(!sync && !available_free_memory(sbi, FREE_NIDS)) + if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) return; if (!mount) { @@ -2062,7 +2065,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) } /* readahead nat pages to be scanned */ - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); down_read(&nm_i->nat_tree_lock); @@ -2092,14 +2095,14 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) up_read(&nm_i->nat_tree_lock); - ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync, mount); + __f2fs_build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -2108,7 +2111,7 @@ void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) * from second parameter of this function. * The returned nid could be used ino as well as nid when inode is created. 
*/ -bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) +bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; @@ -2126,8 +2129,8 @@ retry: return false; } - /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + /* We should not use stale free nids created by f2fs_build_free_nids */ + if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); @@ -2144,14 +2147,14 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true, false); + f2fs_build_free_nids(sbi, true, false); goto retry; } /* - * alloc_nid() should be called prior to this function. + * f2fs_alloc_nid() should be called prior to this function. */ -void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -2166,9 +2169,9 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) } /* - * alloc_nid() should be called prior to this function. + * f2fs_alloc_nid() should be called prior to this function. 
*/ -void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) +void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; @@ -2181,7 +2184,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - if (!available_free_memory(sbi, FREE_NIDS)) { + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) { __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { @@ -2198,7 +2201,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) +int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next; @@ -2226,14 +2229,14 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) return nr - nr_shrink; } -void recover_inline_xattr(struct inode *inode, struct page *page) +void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; size_t inline_size; struct page *ipage; struct f2fs_inode *ri; - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); @@ -2251,11 +2254,11 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: - update_inode(inode, ipage); + f2fs_update_inode(inode, ipage); f2fs_put_page(ipage, 1); } -int recover_xattr_data(struct inode *inode, struct page *page) +int f2fs_recover_xattr_data(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; @@ -2268,25 +2271,25 @@ int recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr 
nid */ - get_node_info(sbi, prev_xnid, &ni); - invalidate_blocks(sbi, ni.blk_addr); + f2fs_get_node_info(sbi, prev_xnid, &ni); + f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: /* 2: update xattr nid in inode */ - if (!alloc_nid(sbi, &new_xnid)) + if (!f2fs_alloc_nid(sbi, &new_xnid)) return -ENOSPC; set_new_dnode(&dn, inode, NULL, NULL, new_xnid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { - alloc_nid_failed(sbi, new_xnid); + f2fs_alloc_nid_failed(sbi, new_xnid); return PTR_ERR(xpage); } - alloc_nid_done(sbi, new_xnid); - update_inode_page(inode); + f2fs_alloc_nid_done(sbi, new_xnid); + f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); @@ -2297,14 +2300,14 @@ recover_xnid: return 0; } -int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) +int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - get_node_info(sbi, ino, &old_ni); + f2fs_get_node_info(sbi, ino, &old_ni); if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; @@ -2358,7 +2361,7 @@ retry: return 0; } -void restore_node_summary(struct f2fs_sb_info *sbi, +void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2375,10 +2378,10 @@ void restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ - ra_meta_pages(sbi, addr, nrpages, META_POR, true); + f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; idx++) { - struct page *page = get_tmp_page(sbi, idx); + struct page *page = f2fs_get_tmp_page(sbi, idx); rn = 
F2FS_NODE(page); sum_entry->nid = rn->footer.nid; @@ -2520,7 +2523,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { - offset = lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); raw_ne = &nat_in_journal(journal, offset); @@ -2557,7 +2560,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2620,7 +2623,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { - struct page *page = get_meta_page(sbi, nat_bits_addr++); + struct page *page = f2fs_get_meta_page(sbi, nat_bits_addr++); memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); @@ -2763,7 +2766,7 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) return 0; } -int build_node_manager(struct f2fs_sb_info *sbi) +int f2fs_build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2783,11 +2786,11 @@ int build_node_manager(struct f2fs_sb_info *sbi) /* load free nid status from nat_bits table */ load_free_nid_bitmap(sbi); - build_free_nids(sbi, true, true); + f2fs_build_free_nids(sbi, true, true); return 0; } -void destroy_node_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next_i; @@ -2859,7 +2862,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kfree(nm_i); } -int __init create_node_manager_caches(void) +int __init 
f2fs_create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry)); @@ -2885,7 +2888,7 @@ fail: return -ENOMEM; } -void destroy_node_manager_caches(void) +void f2fs_destroy_node_manager_caches(void) { kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 3c3551811134..daf81d416b89 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -47,7 +47,7 @@ static struct kmem_cache *fsync_entry_slab; -bool space_for_roll_forward(struct f2fs_sb_info *sbi) +bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); @@ -162,7 +162,7 @@ retry: goto out_put; } - err = acquire_orphan_inode(F2FS_I_SB(inode)); + err = f2fs_acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); goto out_put; @@ -173,7 +173,7 @@ retry: } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { - err = __f2fs_do_add_link(dir, &fname, inode, + err = f2fs_add_dentry(dir, &fname, inode, inode->i_ino, inode->i_mode); } if (err == -ENOMEM) @@ -252,10 +252,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = get_tmp_page(sbi, blkaddr); + page = f2fs_get_tmp_page(sbi, blkaddr); if (!is_recoverable_dnode(page)) break; @@ -269,7 +269,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { - err = recover_inode_page(sbi, page); + err = f2fs_recover_inode_page(sbi, page); if (err) break; quota_inode = true; @@ -310,7 +310,7 @@ next: blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr); } f2fs_put_page(page, 1); return 
err; @@ -353,7 +353,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } } - sum_page = get_sum_page(sbi, segno); + sum_page = f2fs_get_sum_page(sbi, segno); sum_node = (struct f2fs_summary_block *)page_address(sum_page); sum = sum_node->entries[blkoff]; f2fs_put_page(sum_page, 1); @@ -373,7 +373,7 @@ got_it: } /* Get the node page */ - node_page = get_node_page(sbi, nid); + node_page = f2fs_get_node_page(sbi, nid); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -398,7 +398,8 @@ got_it: inode = dn->inode; } - bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); + bidx = f2fs_start_bidx_of_node(offset, inode) + + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its reference @@ -408,11 +409,11 @@ got_it: unlock_page(dn->inode_page); set_new_dnode(&tdn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) goto out; if (tdn.data_blkaddr == blkaddr) - truncate_data_blocks_range(&tdn, 1); + f2fs_truncate_data_blocks_range(&tdn, 1); f2fs_put_dnode(&tdn); out: @@ -425,7 +426,7 @@ out: truncate_out: if (datablock_addr(tdn.inode, tdn.node_page, tdn.ofs_in_node) == blkaddr) - truncate_data_blocks_range(&tdn, 1); + f2fs_truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); return 0; @@ -441,25 +442,25 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* step 1: recover xattr */ if (IS_INODE(page)) { - recover_inline_xattr(inode, page); + f2fs_recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - err = recover_xattr_data(inode, page); + err = f2fs_recover_xattr_data(inode, page); if (!err) recovered++; goto out; } /* step 2: recover inline data */ - if (recover_inline_data(inode, page)) + if (f2fs_recover_inline_data(inode, page)) goto out; /* step 3: recover data indices */ - start = 
start_bidx_of_node(ofs_of_node(page), inode); + start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: - err = get_dnode_of_data(&dn, start, ALLOC_NODE); + err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -470,7 +471,7 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); @@ -486,7 +487,7 @@ retry_dn: /* dest is invalid, just invalidate src block */ if (dest == NULL_ADDR) { - truncate_data_blocks_range(&dn, 1); + f2fs_truncate_data_blocks_range(&dn, 1); continue; } @@ -500,19 +501,19 @@ retry_dn: * and then reserve one new block in dnode page. */ if (dest == NEW_ADDR) { - truncate_data_blocks_range(&dn, 1); - reserve_new_block(&dn); + f2fs_truncate_data_blocks_range(&dn, 1); + f2fs_reserve_new_block(&dn); continue; } /* dest is valid block, try to recover from src to dest */ - if (is_valid_meta_blkaddr(sbi, dest, META_POR)) { + if (f2fs_is_valid_meta_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { - err = reserve_new_block(&dn); + err = f2fs_reserve_new_block(&dn); #ifdef CONFIG_F2FS_FAULT_INJECTION while (err) - err = reserve_new_block(&dn); + err = f2fs_reserve_new_block(&dn); #endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); @@ -567,12 +568,12 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) break; - ra_meta_pages_cond(sbi, blkaddr); + f2fs_ra_meta_pages_cond(sbi, blkaddr); - page = get_tmp_page(sbi, blkaddr); + page = f2fs_get_tmp_page(sbi, blkaddr); if 
(!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); @@ -610,11 +611,11 @@ next: f2fs_put_page(page, 1); } if (!err) - allocate_new_segments(sbi); + f2fs_allocate_new_segments(sbi); return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) +int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct list_head inode_list; struct list_head dir_list; @@ -689,7 +690,7 @@ skip: struct cp_control cpc = { .reason = CP_RECOVERY, }; - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); } kmem_cache_destroy(fsync_entry_slab); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 507f697178b6..8672bf574426 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -169,7 +169,7 @@ found: return result - size + __reverse_ffz(tmp); } -bool need_SSR(struct f2fs_sb_info *sbi) +bool f2fs_need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); @@ -184,7 +184,7 @@ bool need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void register_inmem_page(struct inode *inode, struct page *page) +void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -239,7 +239,8 @@ static int __revoke_inmem_pages(struct inode *inode, trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + err = f2fs_get_dnode_of_data(&dn, page->index, + LOOKUP_NODE); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -249,9 +250,9 @@ retry: err = -EAGAIN; goto next; } - get_node_info(sbi, dn.nid, &ni); + f2fs_get_node_info(sbi, dn.nid, &ni); if (cur->old_addr == NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); 
f2fs_update_data_blkaddr(&dn, NEW_ADDR); } else f2fs_replace_block(sbi, &dn, dn.data_blkaddr, @@ -273,7 +274,7 @@ next: return err; } -void drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) +void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) { struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; struct inode *inode; @@ -296,7 +297,7 @@ next: } drop: set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); iput(inode); } skip: @@ -305,7 +306,7 @@ skip: goto next; } -void drop_inmem_pages(struct inode *inode) +void f2fs_drop_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -323,7 +324,7 @@ void drop_inmem_pages(struct inode *inode) stat_dec_atomic_write(inode); } -void drop_inmem_page(struct inode *inode, struct page *page) +void f2fs_drop_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -353,7 +354,7 @@ void drop_inmem_page(struct inode *inode, struct page *page) trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } -static int __commit_inmem_pages(struct inode *inode) +static int __f2fs_commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -383,14 +384,14 @@ static int __commit_inmem_pages(struct inode *inode) f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); - remove_dirty_inode(inode); + f2fs_remove_dirty_inode(inode); } retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; - err = do_write_data_page(&fio); + err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, HZ/50); @@ -431,7 +432,7 @@ retry: return err; } -int commit_inmem_pages(struct inode *inode) +int 
f2fs_commit_inmem_pages(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -443,7 +444,7 @@ int commit_inmem_pages(struct inode *inode) set_inode_flag(inode, FI_ATOMIC_COMMIT); mutex_lock(&fi->inmem_lock); - err = __commit_inmem_pages(inode); + err = __f2fs_commit_inmem_pages(inode); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) @@ -490,24 +491,24 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) return; /* try to shrink extent cache when there is no enough memory */ - if (!available_free_memory(sbi, EXTENT_CACHE)) + if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ - if (!available_free_memory(sbi, NAT_ENTRIES)) - try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) + f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); - if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, MAX_FREE_NIDS); + if (!f2fs_available_free_memory(sbi, FREE_NIDS)) + f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false, false); + f2fs_build_free_nids(sbi, false, false); if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ - if (!available_free_memory(sbi, NAT_ENTRIES) || - !available_free_memory(sbi, INO_ENTRIES) || + if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) || + !f2fs_available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || f2fs_time_over(sbi, CP_TIME)) { @@ -515,7 +516,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) struct blk_plug plug; blk_start_plug(&plug); - sync_dirty_inodes(sbi, FILE_INODE); + f2fs_sync_dirty_inodes(sbi, FILE_INODE); blk_finish_plug(&plug); } f2fs_sync_fs(sbi->sb, true); @@ -548,7 +549,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) return 
__submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { - if (!is_dirty_device(sbi, ino, i, FLUSH_INO)) + if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) @@ -659,7 +660,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) return cmd.ret; } -int create_flush_cmd_control(struct f2fs_sb_info *sbi) +int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; @@ -696,7 +697,7 @@ init_thread: return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) +void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; @@ -1102,7 +1103,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, goto do_insert; } - p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); + p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); do_insert: dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); if (!dc) @@ -1167,7 +1168,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, lstart, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, @@ -1278,7 +1279,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + f2fs_bug_on(sbi, + !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1331,7 +1333,7 @@ static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) return dropped; } -void drop_discard_cmd(struct f2fs_sb_info *sbi) +void 
f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) { __drop_discard_cmd(sbi); } @@ -1422,7 +1424,8 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) bool need_wait = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root, + NULL, blkaddr); if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); @@ -1437,7 +1440,7 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) __wait_one_discard_bio(sbi, dc); } -void stop_discard_thread(struct f2fs_sb_info *sbi) +void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; @@ -1685,7 +1688,7 @@ static void release_discard_addr(struct discard_entry *entry) kmem_cache_free(discard_entry_slab, entry); } -void release_discard_addrs(struct f2fs_sb_info *sbi) +void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; @@ -1696,7 +1699,7 @@ void release_discard_addrs(struct f2fs_sb_info *sbi) } /* - * Should call clear_prefree_segments after checkpoint is done. + * Should call f2fs_clear_prefree_segments after checkpoint is done. 
*/ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { @@ -1709,7 +1712,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); } -void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, + struct cp_control *cpc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *head = &dcc->entry_list; @@ -1851,7 +1855,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return; - stop_discard_thread(sbi); + f2fs_stop_discard_thread(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; @@ -1967,7 +1971,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) +void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); @@ -1987,7 +1991,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) up_write(&sit_i->sentry_lock); } -bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, offset; @@ -2026,7 +2030,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, /* * Calculate the number of current summary pages for writing */ -int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) +int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; @@ -2056,14 +2060,15 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) /* * Caller should put this summary page */ -struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) 
{ - return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); + return f2fs_get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); } -void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) +void f2fs_update_meta_page(struct f2fs_sb_info *sbi, + void *src, block_t blk_addr) { - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); @@ -2073,14 +2078,14 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) static void write_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_blk, block_t blk_addr) { - update_meta_page(sbi, (void *)sum_blk, blk_addr); + f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); } static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); - struct page *page = grab_meta_page(sbi, blk_addr); + struct page *page = f2fs_grab_meta_page(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; @@ -2325,7 +2330,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - sum_page = get_sum_page(sbi, new_segno); + sum_page = f2fs_get_sum_page(sbi, new_segno); sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); @@ -2339,7 +2344,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) int i, cnt; bool reversed = false; - /* need_SSR() already forces to do this */ + /* f2fs_need_SSR() already forces to do this */ if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { curseg->next_segno = segno; return 1; @@ -2391,7 +2396,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) 
new_curseg(sbi, type, false); - else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) + else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2399,7 +2404,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -void allocate_new_segments(struct f2fs_sb_info *sbi) +void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { struct curseg_info *curseg; unsigned int old_segno; @@ -2421,7 +2426,8 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, + struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; bool has_candidate = false; @@ -2454,9 +2460,9 @@ next: issued = 0; mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); - dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, @@ -2537,7 +2543,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) goto out; mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); if (err) goto out; @@ -2571,7 +2577,7 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -int rw_hint_to_seg_type(enum rw_hint hint) +int f2fs_rw_hint_to_seg_type(enum rw_hint hint) { switch (hint) { case WRITE_LIFE_SHORT: @@ -2644,7 +2650,7 @@ int rw_hint_to_seg_type(enum rw_hint hint) * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ -enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, +enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum 
page_type type, enum temp_type temp) { if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_USER) { @@ -2715,7 +2721,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) is_inode_flag_set(inode, FI_ATOMIC_FILE) || is_inode_flag_set(inode, FI_VOLATILE_FILE)) return CURSEG_HOT_DATA; - /* rw_hint_to_seg_type(inode->i_write_hint); */ + /* f2fs_rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; } else { if (IS_DNODE(fio->page)) @@ -2752,7 +2758,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) return type; } -void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list) @@ -2835,7 +2841,7 @@ static void update_device_state(struct f2fs_io_info *fio) devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); /* update device state for fsync */ - set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); /* update device state for checkpoint */ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { @@ -2853,7 +2859,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (keep_order) down_read(&fio->sbi->io_order_lock); reallocate: - allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ @@ -2869,7 +2875,7 @@ reallocate: up_read(&fio->sbi->io_order_lock); } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, +void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, enum iostat_type io_type) { struct f2fs_io_info fio = { @@ -2895,7 +2901,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } -void 
write_node_page(unsigned int nid, struct f2fs_io_info *fio) +void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; @@ -2905,14 +2911,15 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } -void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) +void f2fs_outplace_write_data(struct dnode_of_data *dn, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); - get_node_info(sbi, dn->nid, &ni); + f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); @@ -2920,7 +2927,7 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -int rewrite_data_page(struct f2fs_io_info *fio) +int f2fs_inplace_write_data(struct f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; @@ -2955,7 +2962,7 @@ static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, return i; } -void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) { @@ -3040,7 +3047,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg, recover_newaddr); f2fs_update_data_blkaddr(dn, new_addr); @@ -3086,7 +3093,7 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) start = start_sum_block(sbi); - page = get_meta_page(sbi, start++); + page = f2fs_get_meta_page(sbi, start++); kaddr = 
(unsigned char *)page_address(page); /* Step 1: restore nat cache */ @@ -3126,7 +3133,7 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); page = NULL; - page = get_meta_page(sbi, start++); + page = f2fs_get_meta_page(sbi, start++); kaddr = (unsigned char *)page_address(page); offset = 0; } @@ -3165,7 +3172,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_addr = GET_SUM_BLOCK(sbi, segno); } - new = get_meta_page(sbi, blk_addr); + new = f2fs_get_meta_page(sbi, blk_addr); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { @@ -3177,7 +3184,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - restore_node_summary(sbi, segno, sum); + f2fs_restore_node_summary(sbi, segno, sum); } } @@ -3209,10 +3216,10 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int err; if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { - int npages = npages_for_summary_flush(sbi, true); + int npages = f2fs_npages_for_summary_flush(sbi, true); if (npages >= 2) - ra_meta_pages(sbi, start_sum_block(sbi), npages, + f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, META_CP, true); /* restore for compacted data summary */ @@ -3221,7 +3228,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) } if (__exist_node_summaries(sbi)) - ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), + f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), NR_CURSEG_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { @@ -3247,7 +3254,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) int written_size = 0; int i, j; - page = grab_meta_page(sbi, blkaddr++); + page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); @@ -3272,7 +3279,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) 
for (j = 0; j < blkoff; j++) { if (!page) { - page = grab_meta_page(sbi, blkaddr++); + page = f2fs_grab_meta_page(sbi, blkaddr++); kaddr = (unsigned char *)page_address(page); memset(kaddr, 0, PAGE_SIZE); written_size = 0; @@ -3309,7 +3316,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, write_current_sum_page(sbi, i, blkaddr + (i - type)); } -void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); @@ -3317,12 +3324,12 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } -void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) +void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, +int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; @@ -3347,7 +3354,7 @@ int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return get_meta_page(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ -3360,7 +3367,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - page = grab_meta_page(sbi, dst_off); + page = f2fs_grab_meta_page(sbi, dst_off); seg_info_to_sit_page(sbi, page, start); set_page_dirty(page); @@ -3456,7 +3463,7 @@ static void remove_sits_in_journal(struct f2fs_sb_info *sbi) * CP calls this function, which flushes SIT entries including sit_journal, * and 
moves prefree segs to free segs. */ -void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; @@ -3528,7 +3535,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(journal, + offset = f2fs_lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = @@ -3744,7 +3751,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) block_t total_node_blocks = 0; do { - readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, META_SIT, true); start = start_blk * sit_i->sents_per_block; @@ -3962,7 +3969,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) up_write(&sit_i->sentry_lock); } -int build_segment_manager(struct f2fs_sb_info *sbi) +int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -3999,7 +4006,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) init_rwsem(&sm_info->curseg_lock); if (!f2fs_readonly(sbi->sb)) { - err = create_flush_cmd_control(sbi); + err = f2fs_create_flush_cmd_control(sbi); if (err) return err; } @@ -4124,13 +4131,13 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kfree(sit_i); } -void destroy_segment_manager(struct f2fs_sb_info *sbi) +void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; - destroy_flush_cmd_control(sbi, true); + f2fs_destroy_flush_cmd_control(sbi, true); destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); @@ -4140,7 +4147,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) kfree(sm_info); } -int 
__init create_segment_manager_caches(void) +int __init f2fs_create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("discard_entry", sizeof(struct discard_entry)); @@ -4173,7 +4180,7 @@ fail: return -ENOMEM; } -void destroy_segment_manager_caches(void) +void f2fs_destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 0b5664a1a6cc..36cfd816c160 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -109,11 +109,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, /* shrink clean nat cache entries */ if (freed < nr) - freed += try_to_free_nats(sbi, nr - freed); + freed += f2fs_try_to_free_nats(sbi, nr - freed); /* shrink free nids cache entries */ if (freed < nr) - freed += try_to_free_nids(sbi, nr - freed); + freed += f2fs_try_to_free_nids(sbi, nr - freed); spin_lock(&f2fs_list_lock); p = p->next; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9819c04e6848..d306725d7399 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -865,7 +865,7 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - drop_inmem_pages(inode); + f2fs_drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1002,7 +1002,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; - write_checkpoint(sbi, &cpc); + f2fs_write_checkpoint(sbi, &cpc); } /* be sure to wait for any on-going discard commands */ @@ -1012,17 +1012,17 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; - write_checkpoint(sbi, &cpc); + f2fs_write_checkpoint(sbi, &cpc); } - /* write_checkpoint can update stat informaion */ + /* f2fs_write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); /* * normally 
superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_ino_entry(sbi, true); + f2fs_release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); @@ -1034,8 +1034,8 @@ static void f2fs_put_super(struct super_block *sb) iput(sbi->meta_inode); /* destroy f2fs internal modules */ - destroy_node_manager(sbi); - destroy_segment_manager(sbi); + f2fs_destroy_node_manager(sbi); + f2fs_destroy_segment_manager(sbi); kfree(sbi->ckpt); @@ -1078,7 +1078,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - err = write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } f2fs_trace_ios(NULL, 1); @@ -1481,11 +1481,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { - stop_gc_thread(sbi); + f2fs_stop_gc_thread(sbi); need_restart_gc = true; } } else if (!sbi->gc_thread) { - err = start_gc_thread(sbi); + err = f2fs_start_gc_thread(sbi); if (err) goto restore_opts; need_stop_gc = true; @@ -1508,9 +1508,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); - destroy_flush_cmd_control(sbi, false); + f2fs_destroy_flush_cmd_control(sbi, false); } else { - err = create_flush_cmd_control(sbi); + err = f2fs_create_flush_cmd_control(sbi); if (err) goto restore_gc; } @@ -1528,11 +1528,11 @@ skip: return 0; restore_gc: if (need_restart_gc) { - if (start_gc_thread(sbi)) + if (f2fs_start_gc_thread(sbi)) f2fs_msg(sbi->sb, KERN_WARNING, "background gc thread has stopped"); } else if (need_stop_gc) { - stop_gc_thread(sbi); + f2fs_stop_gc_thread(sbi); } restore_opts: #ifdef CONFIG_QUOTA @@ -1953,7 +1953,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct 
f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (check_nid_range(sbi, ino)) + if (f2fs_check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* @@ -2279,7 +2279,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 0; } -int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -2832,7 +2832,7 @@ try_onemore: goto free_io_dummy; } - err = get_valid_checkpoint(sbi); + err = f2fs_get_valid_checkpoint(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; @@ -2862,18 +2862,18 @@ try_onemore: spin_lock_init(&sbi->inode_lock[i]); } - init_extent_cache_info(sbi); + f2fs_init_extent_cache_info(sbi); - init_ino_entry_info(sbi); + f2fs_init_ino_entry_info(sbi); /* setup f2fs internal modules */ - err = build_segment_manager(sbi); + err = f2fs_build_segment_manager(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to initialize F2FS segment manager"); goto free_sm; } - err = build_node_manager(sbi); + err = f2fs_build_node_manager(sbi); if (err) { f2fs_msg(sb, KERN_ERR, "Failed to initialize F2FS node manager"); @@ -2891,7 +2891,7 @@ try_onemore: sbi->kbytes_written = le64_to_cpu(seg_i->journal->info.kbytes_written); - build_gc_manager(sbi); + f2fs_build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); @@ -2943,7 +2943,7 @@ try_onemore: } #endif /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_recover_orphan_inodes(sbi); if (err) goto free_meta; @@ -2965,7 +2965,7 @@ try_onemore: if (!retry) goto skip_recovery; - err = recover_fsync_data(sbi, false); + err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, @@ -2973,7 +2973,7 @@ try_onemore: goto free_meta; } } else { - err = recover_fsync_data(sbi, true); + err = 
f2fs_recover_fsync_data(sbi, true); if (!f2fs_readonly(sb) && err > 0) { err = -EINVAL; @@ -2983,7 +2983,7 @@ try_onemore: } } skip_recovery: - /* recover_fsync_data() cleared this already */ + /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); /* @@ -2992,7 +2992,7 @@ skip_recovery: */ if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ - err = start_gc_thread(sbi); + err = f2fs_start_gc_thread(sbi); if (err) goto free_meta; } @@ -3023,10 +3023,10 @@ free_meta: #endif f2fs_sync_inode_meta(sbi); /* - * Some dirty meta pages can be produced by recover_orphan_inodes() + * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). + * followed by f2fs_write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in f2fs_sync_meta_pages(). 
*/ truncate_inode_pages_final(META_MAPPING(sbi)); #ifdef CONFIG_QUOTA @@ -3039,13 +3039,13 @@ free_root_inode: free_stats: f2fs_destroy_stats(sbi); free_node_inode: - release_ino_entry(sbi, true); + f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); free_nm: - destroy_node_manager(sbi); + f2fs_destroy_node_manager(sbi); free_sm: - destroy_segment_manager(sbi); + f2fs_destroy_segment_manager(sbi); free_devices: destroy_device_list(sbi); kfree(sbi->ckpt); @@ -3091,8 +3091,8 @@ static void kill_f2fs_super(struct super_block *sb) { if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); - stop_gc_thread(F2FS_SB(sb)); - stop_discard_thread(F2FS_SB(sb)); + f2fs_stop_gc_thread(F2FS_SB(sb)); + f2fs_stop_discard_thread(F2FS_SB(sb)); } kill_block_super(sb); } @@ -3141,16 +3141,16 @@ static int __init init_f2fs_fs(void) err = init_inodecache(); if (err) goto fail; - err = create_node_manager_caches(); + err = f2fs_create_node_manager_caches(); if (err) goto free_inodecache; - err = create_segment_manager_caches(); + err = f2fs_create_segment_manager_caches(); if (err) goto free_node_manager_caches; - err = create_checkpoint_caches(); + err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; - err = create_extent_cache(); + err = f2fs_create_extent_cache(); if (err) goto free_checkpoint_caches; err = f2fs_init_sysfs(); @@ -3179,13 +3179,13 @@ free_shrinker: free_sysfs: f2fs_exit_sysfs(); free_extent_cache: - destroy_extent_cache(); + f2fs_destroy_extent_cache(); free_checkpoint_caches: - destroy_checkpoint_caches(); + f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: - destroy_segment_manager_caches(); + f2fs_destroy_segment_manager_caches(); free_node_manager_caches: - destroy_node_manager_caches(); + f2fs_destroy_node_manager_caches(); free_inodecache: destroy_inodecache(); fail: @@ -3199,10 +3199,10 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); 
unregister_shrinker(&f2fs_shrinker_info); f2fs_exit_sysfs(); - destroy_extent_cache(); - destroy_checkpoint_caches(); - destroy_segment_manager_caches(); - destroy_node_manager_caches(); + f2fs_destroy_extent_cache(); + f2fs_destroy_checkpoint_caches(); + f2fs_destroy_segment_manager_caches(); + f2fs_destroy_node_manager_caches(); destroy_inodecache(); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index ac3ea6044936..60c827eadd82 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -165,7 +165,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return snprintf(buf, PAGE_SIZE, "%u\n", *ui); } -static ssize_t __f2fs_sbi_store(struct f2fs_attr *a, +static ssize_t __sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { @@ -201,13 +201,13 @@ static ssize_t __f2fs_sbi_store(struct f2fs_attr *a, down_write(&sbi->sb_lock); - ret = update_extension_list(sbi, name, hot, set); + ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) goto out; ret = f2fs_commit_super(sbi, false); if (ret) - update_extension_list(sbi, name, hot, !set); + f2fs_update_extension_list(sbi, name, hot, !set); out: up_write(&sbi->sb_lock); return ret ? 
ret : count; @@ -288,7 +288,7 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, if (gc_entry) down_read(&sbi->sb->s_umount); - ret = __f2fs_sbi_store(a, sbi, buf, count); + ret = __sbi_store(a, sbi, buf, count); if (gc_entry) up_read(&sbi->sb->s_umount); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 116be979b897..61a5d9284bc0 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -299,7 +299,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - page = get_node_page(sbi, inode->i_ino); + page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -320,7 +320,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); + xpage = f2fs_get_node_page(sbi, xnid); if (IS_ERR(xpage)) return PTR_ERR(xpage); @@ -444,7 +444,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) - if (!alloc_nid(sbi, &new_nid)) + if (!f2fs_alloc_nid(sbi, &new_nid)) return -ENOSPC; /* write to inline xattr */ @@ -452,9 +452,9 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - in_page = get_node_page(sbi, inode->i_ino); + in_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(in_page)) { - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); return PTR_ERR(in_page); } inline_addr = inline_xattr_addr(inode, in_page); @@ -464,8 +464,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode); - alloc_nid_failed(sbi, new_nid); + err = f2fs_truncate_xattr_node(inode); + f2fs_alloc_nid_failed(sbi, new_nid); if (err) { 
f2fs_put_page(in_page, 1); return err; @@ -478,10 +478,10 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } f2fs_bug_on(sbi, new_nid); @@ -489,13 +489,13 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); - alloc_nid_failed(sbi, new_nid); + f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } - alloc_nid_done(sbi, new_nid); + f2fs_alloc_nid_done(sbi, new_nid); } xattr_addr = page_address(xpage); @@ -733,7 +733,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (err) return err; - /* this case is only from init_inode_metadata */ + /* this case is only from f2fs_init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); From d5b4710fcf381855b348216179e925f78815ef2c Mon Sep 17 00:00:00 2001 From: youngjun yoo Date: Wed, 30 May 2018 04:21:14 +0900 Subject: [PATCH 768/804] fs: f2fs: changed variable type of offset "unsigned" to "loff_t" clean up checkpatch warning: WARNING: Prefer 'unsigned int' to bare use of 'unsigned' Signed-off-by: youngjun yoo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f1476c93ded5..516fe3cc85ff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -98,7 +98,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, /* page is wholly or partially inside EOF 
*/ if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { - unsigned offset; + loff_t offset; offset = i_size_read(inode) & ~PAGE_MASK; zero_user_segment(page, offset, PAGE_SIZE); } @@ -543,7 +543,7 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_SIZE - 1); + loff_t offset = from & (PAGE_SIZE - 1); pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; From 39ee53e22320abc578d94dded9244d64d450135a Mon Sep 17 00:00:00 2001 From: youngjun yoo Date: Wed, 30 May 2018 04:33:07 +0900 Subject: [PATCH 769/804] fs: f2fs: add missing blank lines after declarations clean up checkpatch warning: WARNING: Missing a blank line after declarations Signed-off-by: youngjun yoo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 516fe3cc85ff..c01e97426b2f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -99,6 +99,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { loff_t offset; + offset = i_size_read(inode) & ~PAGE_MASK; zero_user_segment(page, offset, PAGE_SIZE); } @@ -416,6 +417,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); @@ -506,6 +508,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); + if (blkaddr == NULL_ADDR) continue; From 1ae5aadab1914fbdfcc24761005203e46fa0b343 Mon Sep 17 00:00:00 2001 From: youngjun yoo Date: Wed, 30 May 2018 04:34:58 +0900 Subject: [PATCH 770/804] fs: f2fs: insert 
space around that ':' and ', ' clean up checkpatch error: ERROR: space required after that ':' ERROR: space required after that ',' Signed-off-by: youngjun yoo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c01e97426b2f..de1c712777c9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1487,7 +1487,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, last_off = map.m_lblk + map.m_len - 1; /* update new size to the failed position */ - new_size = (last_off == pg_end) ? offset + len: + new_size = (last_off == pg_end) ? offset + len : (loff_t)(last_off + 1) << PAGE_SHIFT; } else { new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; @@ -2132,7 +2132,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; From 588ecdfd7d023e7ed43fc516823d7df3c9d14fc3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:17 +0800 Subject: [PATCH 771/804] f2fs: fix to update mtime correctly If we change system time to the past, get_mtime() will return an overflowed time, and SIT_I(sbi)->max_mtime will be updated incorrectly, this patch fixes the two issues.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/segment.c | 7 ++++--- fs/f2fs/segment.h | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index b00c807c8c8b..60b4886f5bb6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1234,7 +1234,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) * modify checkpoint * version number is already updated */ - ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); + ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8672bf574426..9a3dc92ecf23 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1902,8 +1902,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; - se->mtime = get_mtime(sbi); - SIT_I(sbi)->max_mtime = se->mtime; + se->mtime = get_mtime(sbi, false); + if (se->mtime > SIT_I(sbi)->max_mtime) + SIT_I(sbi)->max_mtime = se->mtime; /* Update valid block bitmap */ if (del > 0) { @@ -3965,7 +3966,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; } - sit_i->max_mtime = get_mtime(sbi); + sit_i->max_mtime = get_mtime(sbi, false); up_write(&sit_i->sentry_lock); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3e7ef7c6771f..f18fc82fbe99 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -745,11 +745,23 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) #endif } -static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) +static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, + bool base_time) { struct sit_info *sit_i = SIT_I(sbi); - return 
sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - - sit_i->mounted_time; + time64_t diff, now = ktime_get_real_seconds(); + + if (now >= sit_i->mounted_time) + return sit_i->elapsed_time + now - sit_i->mounted_time; + + /* system time is set to the past */ + if (!base_time) { + diff = sit_i->mounted_time - now; + if (sit_i->elapsed_time >= diff) + return sit_i->elapsed_time - diff; + return 0; + } + return sit_i->elapsed_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, From 6a4540cf1984dafe622622d647f22089ef404839 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:35 +0800 Subject: [PATCH 772/804] f2fs: don't change wbc->sync_mode We should never falsify wbc->sync_mode passed from mm, otherwise mm can trigger writeback with wrong IO priority. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cd0f60b5be7a..5264b079b93e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1744,7 +1744,6 @@ static int f2fs_write_node_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, NODE); diff = nr_pages_to_write(sbi, NODE, wbc); - wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); From 853e7339b634660b951d9892e036faf225cf1187 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:36 +0800 Subject: [PATCH 773/804] f2fs: let sync node IO interrupt async one Although mixed sync/async IOs can have continuous LBA, as they have different IO priority, block IO scheduler will add them into different queues and commit them separately, resulting in split IOs, which causes worse performance. This patch gives high priority to synchronous IO of nodes, meaning that once synchronous flow starts, it can interrupt asynchronous writeback flow of system flusher, so more big IOs can be expected.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/data.c | 9 +++++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 ++ fs/f2fs/gc.c | 7 +++++++ fs/f2fs/node.c | 21 ++++++++++++++++++--- fs/f2fs/super.c | 3 ++- 7 files changed, 37 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 60b4886f5bb6..e255e9b5538f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1090,7 +1090,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); + atomic_inc(&sbi->wb_sync_req[NODE]); err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); + atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4b0db685e5d5..a166927355c8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1930,6 +1930,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; struct pagevec pvec; + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; @@ -1984,7 +1985,7 @@ retry: bool submitted = false; /* give a priority to WB_SYNC threads */ - if (atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) && + if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; @@ -2104,8 +2105,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) - atomic_inc(&sbi->wb_sync_req); - else if (atomic_read(&sbi->wb_sync_req)) + atomic_inc(&sbi->wb_sync_req[DATA]); + else if (atomic_read(&sbi->wb_sync_req[DATA])) goto skip_write; blk_start_plug(&plug); @@ -2113,7 +2114,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, blk_finish_plug(&plug); if (wbc->sync_mode == WB_SYNC_ALL) - atomic_dec(&sbi->wb_sync_req); + 
atomic_dec(&sbi->wb_sync_req[DATA]); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e91f7ff71dc6..6873b321c2c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1265,7 +1265,7 @@ struct f2fs_sb_info { struct percpu_counter alloc_valid_block_count; /* writeback control */ - atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */ /* valid inode count */ struct percpu_counter total_valid_inode_count; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index de1c712777c9..8b0002f05451 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -277,7 +277,9 @@ go_write: goto out; } sync_nodes: + atomic_inc(&sbi->wb_sync_req[NODE]); ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); + atomic_dec(&sbi->wb_sync_req[NODE]); if (ret) goto out; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index dcadc0691a3e..772ef64d2035 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -473,12 +473,16 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, block_t start_addr; int off; int phase = 0; + bool fggc = (gc_type == FG_GC); start_addr = START_BLOCK(sbi, segno); next_step: entry = sum; + if (fggc && phase == 2) + atomic_inc(&sbi->wb_sync_req[NODE]); + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; @@ -525,6 +529,9 @@ next_step: if (++phase < 3) goto next_step; + + if (fggc) + atomic_dec(&sbi->wb_sync_req[NODE]); } /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5264b079b93e..baa8ee1aca38 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1596,21 +1596,28 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, int step = 0; int nwritten = 0; int ret = 0; - int nr_pages; + int nr_pages, done = 0; pagevec_init(&pvec, 0); next_step: index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while (!done && 
(nr_pages = pagevec_lookup_tag(&pvec, + NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; bool submitted = false; + /* give a priority to WB_SYNC threads */ + if (atomic_read(&sbi->wb_sync_req[NODE]) && + wbc->sync_mode == WB_SYNC_NONE) { + done = 1; + break; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1741,6 +1748,11 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req[NODE]); + else if (atomic_read(&sbi->wb_sync_req[NODE])) + goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); diff = nr_pages_to_write(sbi, NODE, wbc); @@ -1748,6 +1760,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req[NODE]); return 0; skip_write: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d306725d7399..0b803213ed64 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2360,7 +2360,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); - atomic_set(&sbi->wb_sync_req, 0); + for (i = 0; i < META; i++) + atomic_set(&sbi->wb_sync_req[i], 0); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); From d400752f547f8aea87260885fcdceed3a58e9072 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Jun 2018 23:20:51 +0800 Subject: [PATCH 774/804] f2fs: fix to clear FI_VOLATILE_FILE correctly Thread A Thread B - f2fs_release_file - clear_inode_flag(FI_VOLATILE_FILE) - wb_writeback - writeback_sb_inodes - __writeback_single_inode - do_writepages - f2fs_write_data_pages - __write_data_page all volatile file's pages are writebacked 
to storage - set_inode_flag(FI_DROP_CACHE) - filemap_fdatawrite There is a hole that mm can flush all dirty pages of volatile file as inode is not tagged with both FI_VOLATILE_FILE and FI_DROP_CACHE flags, we should never writeback the page #0 and also it's unneeded to writeback other pages. This patch adjusts to relocate clear_inode_flag(FI_VOLATILE_FILE), so that FI_VOLATILE_FILE flag can be remained before all dirty pages were dropped to avoid issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8b0002f05451..2ca53f7b94e9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1571,11 +1571,11 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) if (f2fs_is_atomic_file(inode)) f2fs_drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); } return 0; } From c41203299a521a7ba9bb41afbc14c534ee1e3554 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 7 May 2018 16:47:02 -0700 Subject: [PATCH 775/804] overflow.h: Add allocation size calculation helpers In preparation for replacing unchecked overflows for memory allocations, this creates helpers for the 3 most common calculations: array_size(a, b): 2-dimensional array array3_size(a, b, c): 3-dimensional array struct_size(ptr, member, n): struct followed by n-many trailing members Each of these return SIZE_MAX on overflow instead of wrapping around. (Additionally renames a variable named "array_size" to avoid future collision.) 
Co-developed-by: Matthew Wilcox Signed-off-by: Kees Cook --- drivers/md/dm-table.c | 10 +- include/linux/overflow.h | 278 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 283 insertions(+), 5 deletions(-) create mode 100644 include/linux/overflow.h diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cb5d0daf53bb..8e9646a2550d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -507,14 +507,14 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) * On the other hand, dm-switch needs to process bulk data using messages and * excessive use of GFP_NOIO could cause trouble. */ -static char **realloc_argv(unsigned *array_size, char **old_argv) +static char **realloc_argv(unsigned *size, char **old_argv) { char **argv; unsigned new_size; gfp_t gfp; - if (*array_size) { - new_size = *array_size * 2; + if (*size) { + new_size = *size * 2; gfp = GFP_KERNEL; } else { new_size = 8; @@ -522,8 +522,8 @@ static char **realloc_argv(unsigned *array_size, char **old_argv) } argv = kmalloc(new_size * sizeof(*argv), gfp); if (argv) { - memcpy(argv, old_argv, *array_size * sizeof(*argv)); - *array_size = new_size; + memcpy(argv, old_argv, *size * sizeof(*argv)); + *size = new_size; } kfree(old_argv); diff --git a/include/linux/overflow.h b/include/linux/overflow.h new file mode 100644 index 000000000000..8712ff70995f --- /dev/null +++ b/include/linux/overflow.h @@ -0,0 +1,278 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#include + +/* + * In the fallback code below, we need to compute the minimum and + * maximum values representable in a given type. These macros may also + * be useful elsewhere, so we provide them outside the + * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. + * + * It would seem more obvious to do something like + * + * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) + * #define type_max(T) (T)(is_signed_type(T) ? 
((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) + * + * Unfortunately, the middle expressions, strictly speaking, have + * undefined behaviour, and at least some versions of gcc warn about + * the type_max expression (but not if -fsanitize=undefined is in + * effect; in that case, the warning is deferred to runtime...). + * + * The slightly excessive casting in type_min is to make sure the + * macros also produce sensible values for the exotic type _Bool. [The + * overflow checkers only almost work for _Bool, but that's + * a-feature-not-a-bug, since people shouldn't be doing arithmetic on + * _Bools. Besides, the gcc builtins don't allow _Bool* as third + * argument.] + * + * Idea stolen from + * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - + * credit to Christian Biere. + */ +#define is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +/* + * For simplicity and code hygiene, the fallback code below insists on + * a, b and *d having the same type (similar to the min() and max() + * macros), whereas gcc's type-generic overflow checkers accept + * different types. Hence we don't just make check_add_overflow an + * alias for __builtin_add_overflow, but add type checks similar to + * below. 
+ */ +#define check_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_add_overflow(__a, __b, __d); \ +}) + +#define check_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_sub_overflow(__a, __b, __d); \ +}) + +#define check_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +}) + +#else + + +/* Checking for unsigned overflow is relatively easy without causing UB. */ +#define __unsigned_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a + __b; \ + *__d < __a; \ +}) +#define __unsigned_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a - __b; \ + __a < __b; \ +}) +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? \ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * For signed types, detecting overflow is much harder, especially if + * we want to avoid UB. But the interface of these macros is such that + * we must provide a result in *d, and in fact we must produce the + * result promised by gcc's builtins, which is simply the possibly + * wrapped-around value. 
Fortunately, we can just formally do the + * operations in the widest relevant unsigned type (u64) and then + * truncate the result - gcc is smart enough to generate the same code + * with and without the (u64) casts. + */ + +/* + * Adding two signed integers can overflow only if they have the same + * sign, and overflow has happened iff the result has the opposite + * sign. + */ +#define __signed_add_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a + (u64)__b; \ + (((~(__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Subtraction is similar, except that overflow can now happen only + * when the signs are opposite. In this case, overflow has happened if + * the result has the opposite sign of a. + */ +#define __signed_sub_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a - (u64)__b; \ + ((((__a ^ __b)) & (*__d ^ __a)) \ + & type_min(typeof(__a))) != 0; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. 
+ */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (u64)__a * (u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + + +#define check_add_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_add_overflow(a, b, d), \ + __unsigned_add_overflow(a, b, d)) + +#define check_sub_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_sub_overflow(a, b, d), \ + __unsigned_sub_overflow(a, b, d)) + +#define check_mul_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d)) + + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +/** + * array_size() - Calculate size of 2-dimensional array. + * + * @a: dimension one + * @b: dimension two + * + * Calculates size of 2-dimensional array: @a * @b. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. + */ +static inline __must_check size_t array_size(size_t a, size_t b) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * array3_size() - Calculate size of 3-dimensional array. + * + * @a: dimension one + * @b: dimension two + * @c: dimension three + * + * Calculates size of 3-dimensional array: @a * @b * @c. + * + * Returns: number of bytes needed to represent the array or SIZE_MAX on + * overflow. 
+ */ +static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(a, b, &bytes)) + return SIZE_MAX; + if (check_mul_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c) +{ + size_t bytes; + + if (check_mul_overflow(n, size, &bytes)) + return SIZE_MAX; + if (check_add_overflow(bytes, c, &bytes)) + return SIZE_MAX; + + return bytes; +} + +/** + * struct_size() - Calculate size of structure with trailing array. + * @p: Pointer to the structure. + * @member: Name of the array member. + * @n: Number of elements in the array. + * + * Calculates size of memory needed for structure @p followed by an + * array of @n @member elements. + * + * Return: number of bytes needed or SIZE_MAX on overflow. + */ +#define struct_size(p, member, n) \ + __ab_c_size(n, \ + sizeof(*(p)->member) + __must_be_array((p)->member),\ + sizeof(*(p))) + +#endif /* __LINUX_OVERFLOW_H */ From 3ea03ea4bd0940bb8f9bc18f957918d1fd7e90db Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:28:16 -0700 Subject: [PATCH 776/804] treewide: Use array_size() in f2fs_kmalloc() The f2fs_kmalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: f2fs_kmalloc(handle, a * b, gfp) with: f2fs_kmalloc(handle, array_size(a, b), gfp) as well as handling cases of: f2fs_kmalloc(handle, a * b * c, gfp) with: f2fs_kmalloc(handle, array3_size(a, b, c), gfp) This does, however, attempt to ignore constant size factors like: f2fs_kmalloc(handle, 4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). 
@@ expression HANDLE; type TYPE; expression THING, E; @@ ( f2fs_kmalloc(HANDLE, - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | f2fs_kmalloc(HANDLE, - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression HANDLE; expression COUNT; typedef u8; typedef __u8; @@ ( f2fs_kmalloc(HANDLE, - sizeof(u8) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(__u8) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(char) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(unsigned char) * (COUNT) + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(u8) * COUNT + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(__u8) * COUNT + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(char) * COUNT + COUNT , ...) | f2fs_kmalloc(HANDLE, - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ expression HANDLE; type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ expression HANDLE; identifier SIZE, COUNT; @@ f2fs_kmalloc(HANDLE, - SIZE * COUNT + array_size(COUNT, SIZE) , ...) 
// 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression HANDLE; expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression HANDLE; expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | f2fs_kmalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) 
) // 3-factor product, only identifiers, with redundant parens removed. @@ expression HANDLE; identifier STRIDE, SIZE, COUNT; @@ ( f2fs_kmalloc(HANDLE, - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kmalloc(HANDLE, - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression HANDLE; expression E1, E2, E3; constant C1, C2, C3; @@ ( f2fs_kmalloc(HANDLE, C1 * C2 * C3, ...) | f2fs_kmalloc(HANDLE, - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression HANDLE; expression E1, E2; constant C1, C2; @@ ( f2fs_kmalloc(HANDLE, C1 * C2, ...) | f2fs_kmalloc(HANDLE, - E1 * E2 + array_size(E1, E2) , ...) ) Signed-off-by: Kees Cook --- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6873b321c2c1..0b514cf1ac6f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -26,6 +26,7 @@ #include #include #include +#include #define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION) #include diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0b803213ed64..bfa56b037ed8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2791,9 +2791,11 @@ try_onemore: int n = (i == META) ? 
1: NR_TEMP_TYPE; int j; - sbi->write_io[i] = f2fs_kmalloc(sbi, - n * sizeof(struct f2fs_bio_info), - GFP_KERNEL); + sbi->write_io[i] = + f2fs_kmalloc(sbi, + array_size(n, + sizeof(struct f2fs_bio_info)), + GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; goto free_options; From f15443db99c35cd3bf44d76bc4f6d181f89e4acd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:28:23 -0700 Subject: [PATCH 777/804] treewide: Use array_size() in f2fs_kzalloc() The f2fs_kzalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: f2fs_kzalloc(handle, a * b, gfp) with: f2fs_kzalloc(handle, array_size(a, b), gfp) as well as handling cases of: f2fs_kzalloc(handle, a * b * c, gfp) with: f2fs_kzalloc(handle, array3_size(a, b, c), gfp) This does, however, attempt to ignore constant size factors like: f2fs_kzalloc(handle, 4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ expression HANDLE; type TYPE; expression THING, E; @@ ( f2fs_kzalloc(HANDLE, - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | f2fs_kzalloc(HANDLE, - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression HANDLE; expression COUNT; typedef u8; typedef __u8; @@ ( f2fs_kzalloc(HANDLE, - sizeof(u8) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(__u8) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(char) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(unsigned char) * (COUNT) + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(u8) * COUNT + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(__u8) * COUNT + COUNT , ...) | f2fs_kzalloc(HANDLE, - sizeof(char) * COUNT + COUNT , ...) 
| f2fs_kzalloc(HANDLE, - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ expression HANDLE; type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ expression HANDLE; identifier SIZE, COUNT; @@ f2fs_kzalloc(HANDLE, - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression HANDLE; expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) 
| f2fs_kzalloc(HANDLE, - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression HANDLE; expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | f2fs_kzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ expression HANDLE; identifier STRIDE, SIZE, COUNT; @@ ( f2fs_kzalloc(HANDLE, - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) 
| f2fs_kzalloc(HANDLE, - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kzalloc(HANDLE, - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression HANDLE; expression E1, E2, E3; constant C1, C2, C3; @@ ( f2fs_kzalloc(HANDLE, C1 * C2 * C3, ...) | f2fs_kzalloc(HANDLE, - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression HANDLE; expression E1, E2; constant C1, C2; @@ ( f2fs_kzalloc(HANDLE, C1 * C2, ...) | f2fs_kzalloc(HANDLE, - E1 * E2 + array_size(E1, E2) , ...) ) Signed-off-by: Kees Cook --- fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/node.c | 6 ++++-- fs/f2fs/segment.c | 3 ++- fs/f2fs/super.c | 12 ++++++++---- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e255e9b5538f..178623c15765 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -807,7 +807,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) block_t cp_blk_no; int i; - sbi->ckpt = f2fs_kzalloc(sbi, cp_blks * blk_size, GFP_KERNEL); + sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks), + GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index baa8ee1aca38..1ad24998e29c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2756,8 +2756,10 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); int i; - nm_i->free_nid_bitmap = f2fs_kzalloc(sbi, nm_i->nat_blocks * - sizeof(unsigned char *), GFP_KERNEL); + nm_i->free_nid_bitmap = + f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *), + nm_i->nat_blocks), + GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9a3dc92ecf23..97ec716ac0c1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c 
@@ -3716,7 +3716,8 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = f2fs_kzalloc(sbi, sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), + GFP_KERNEL); if (!array) return -ENOMEM; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bfa56b037ed8..08635dc2594f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2423,8 +2423,10 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) #define F2FS_REPORT_NR_ZONES 4096 - zones = f2fs_kzalloc(sbi, sizeof(struct blk_zone) * - F2FS_REPORT_NR_ZONES, GFP_KERNEL); + zones = f2fs_kzalloc(sbi, + array_size(F2FS_REPORT_NR_ZONES, + sizeof(struct blk_zone)), + GFP_KERNEL); if (!zones) return -ENOMEM; @@ -2568,8 +2570,10 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) * Initialize multiple devices information, or single * zoned block device information. */ - sbi->devs = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_info) * - max_devices, GFP_KERNEL); + sbi->devs = f2fs_kzalloc(sbi, + array_size(max_devices, + sizeof(struct f2fs_dev_info)), + GFP_KERNEL); if (!sbi->devs) return -ENOMEM; From 6944da0a68ca00f8f27bd71e0e0e292ea14b5ca5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Jun 2018 14:28:35 -0700 Subject: [PATCH 778/804] treewide: Use array_size in f2fs_kvzalloc() The f2fs_kvzalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: f2fs_kvzalloc(handle, a * b, gfp) with: f2fs_kvzalloc(handle, array_size(a, b), gfp) as well as handling cases of: f2fs_kvzalloc(handle, a * b * c, gfp) with: f2fs_kvzalloc(handle, array3_size(a, b, c), gfp) This does, however, attempt to ignore constant size factors like: f2fs_kvzalloc(handle, 4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. 
Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ expression HANDLE; type TYPE; expression THING, E; @@ ( f2fs_kvzalloc(HANDLE, - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | f2fs_kvzalloc(HANDLE, - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression HANDLE; expression COUNT; typedef u8; typedef __u8; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(u8) * (COUNT) + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(__u8) * (COUNT) + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(char) * (COUNT) + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(unsigned char) * (COUNT) + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(u8) * COUNT + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(__u8) * COUNT + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(char) * COUNT + COUNT , ...) | f2fs_kvzalloc(HANDLE, - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ expression HANDLE; type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) 
) // 2-factor product, only identifiers. @@ expression HANDLE; identifier SIZE, COUNT; @@ f2fs_kvzalloc(HANDLE, - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression HANDLE; expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression HANDLE; expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) 
| f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | f2fs_kvzalloc(HANDLE, - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ expression HANDLE; identifier STRIDE, SIZE, COUNT; @@ ( f2fs_kvzalloc(HANDLE, - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | f2fs_kvzalloc(HANDLE, - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression HANDLE; expression E1, E2, E3; constant C1, C2, C3; @@ ( f2fs_kvzalloc(HANDLE, C1 * C2 * C3, ...) | f2fs_kvzalloc(HANDLE, - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression HANDLE; expression E1, E2; constant C1, C2; @@ ( f2fs_kvzalloc(HANDLE, C1 * C2, ...) | f2fs_kvzalloc(HANDLE, - E1 * E2 + array_size(E1, E2) , ...) 
) Signed-off-by: Kees Cook --- fs/f2fs/file.c | 6 ++++-- fs/f2fs/node.c | 6 ++++-- fs/f2fs/segment.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2ca53f7b94e9..1ada29893092 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1129,12 +1129,14 @@ static int __exchange_data_block(struct inode *src_inode, olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), - sizeof(block_t) * olen, GFP_KERNEL); + array_size(olen, sizeof(block_t)), + GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), - sizeof(int) * olen, GFP_KERNEL); + array_size(olen, sizeof(int)), + GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1ad24998e29c..b72fac4766a9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2775,8 +2775,10 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(sbi, nm_i->nat_blocks * - sizeof(unsigned short), GFP_KERNEL); + nm_i->free_nid_count = + f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short), + nm_i->nat_blocks), + GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 97ec716ac0c1..3d0c42ef0474 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3598,8 +3598,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(sbi, MAIN_SEGS(sbi) * - sizeof(struct seg_entry), GFP_KERNEL); + sit_i->sentries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), + MAIN_SEGS(sbi)), + GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; @@ -3639,8 +3641,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(sbi, MAIN_SECS(sbi) * - sizeof(struct 
sec_entry), GFP_KERNEL); + sit_i->sec_entries = + f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), + MAIN_SECS(sbi)), + GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } From 8bdaa17ffa3f406efb03ba72dbad339531b178ad Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 29 Nov 2017 22:29:47 +0900 Subject: [PATCH 779/804] UPSTREAM: android: binder: Check for errors in binder_alloc_shrinker_init(). Both list_lru_init() and register_shrinker() might return an error. Signed-off-by: Tetsuo Handa Cc: Sherry Yang Cc: Michal Hocko Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 533dfb250d1c8d2bb8c9b65252f7b296b29913d4) Change-Id: I5325ccaf34a04179ef3dae73dd8f3abfd6e21565 --- drivers/android/binder.c | 4 +++- drivers/android/binder_alloc.c | 12 +++++++++--- drivers/android/binder_alloc.h | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 7f3e8dcd6006..9197d4e70238 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5781,7 +5781,9 @@ static int __init binder_init(void) struct binder_device *device; struct hlist_node *tmp; - binder_alloc_shrinker_init(); + ret = binder_alloc_shrinker_init(); + if (ret) + return ret; atomic_set(&binder_transaction_log.cur, ~0U); atomic_set(&binder_transaction_log_failed.cur, ~0U); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 3a4279d219f7..a1e123b5a2b6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1010,8 +1010,14 @@ void binder_alloc_init(struct binder_alloc *alloc) INIT_LIST_HEAD(&alloc->buffers); } -void binder_alloc_shrinker_init(void) +int binder_alloc_shrinker_init(void) { - list_lru_init(&binder_alloc_lru); - register_shrinker(&binder_shrinker); + int ret = list_lru_init(&binder_alloc_lru); + + if (ret == 0) { + ret = register_shrinker(&binder_shrinker); + if (ret) + list_lru_destroy(&binder_alloc_lru); + } + return ret; } diff --git 
a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 0b145307f1fd..9ef64e563856 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -130,7 +130,7 @@ extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t extra_buffers_size, int is_async); extern void binder_alloc_init(struct binder_alloc *alloc); -void binder_alloc_shrinker_init(void); +extern int binder_alloc_shrinker_init(void); extern void binder_alloc_vma_close(struct binder_alloc *alloc); extern struct binder_buffer * binder_alloc_prepare_to_free(struct binder_alloc *alloc, From 8429d9832a2cb3d301eeb5d6606b6cd088b9e52b Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Thu, 14 Dec 2017 12:15:42 +0800 Subject: [PATCH 780/804] UPSTREAM: ANDROID: binder: make binder_alloc_new_buf_locked static and indent its arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function binder_alloc_new_buf_locked() is only used in this file, so make it static. Also clean up sparse warning: drivers/android/binder_alloc.c:330:23: warning: no previous prototype for ‘binder_alloc_new_buf_locked’ [-Wmissing-prototypes] In addition, the line of the function name exceeds 80 characters when add static for this function, hence indent its arguments anew. Signed-off-by: Xiongwei Song Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 3f827245463a57f5ef64a665e1ca64eed0da00a5) Change-Id: I6b379df815d30f9b3e9f1dd50334375123b25bbc --- drivers/android/binder_alloc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index a1e123b5a2b6..9f0204a44ba0 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -328,11 +328,12 @@ err_no_vma: return vma ? 
-ENOMEM : -ESRCH; } -struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, - size_t data_size, - size_t offsets_size, - size_t extra_buffers_size, - int is_async) +static struct binder_buffer *binder_alloc_new_buf_locked( + struct binder_alloc *alloc, + size_t data_size, + size_t offsets_size, + size_t extra_buffers_size, + int is_async) { struct rb_node *n = alloc->free_buffers.rb_node; struct binder_buffer *buffer; From dab911911501027bca4afdf8425263e7cb9341bb Mon Sep 17 00:00:00 2001 From: Elad Wexler Date: Fri, 29 Dec 2017 11:03:37 +0200 Subject: [PATCH 781/804] UPSTREAM: android: binder: Prefer __func__ to using hardcoded function name Coding style fixup Signed-off-by: Elad Wexler Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 00c41cddebde8d1a635bf81a7b255b7e56fd0d15) Change-Id: I795e2a9f525c4a8df5cd0a81842a88529ba54f21 --- drivers/android/binder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 9197d4e70238..805df83963d4 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4911,7 +4911,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) return ret; err_bad_arg: - pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", + pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); return ret; } @@ -4921,7 +4921,7 @@ static int binder_open(struct inode *nodp, struct file *filp) struct binder_proc *proc; struct binder_device *binder_dev; - binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n", + binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__, current->group_leader->pid, current->pid); proc = kzalloc(sizeof(*proc), GFP_KERNEL); From 086c9e40bfac145b292d79e73ee4501506073e15 Mon Sep 17 00:00:00 2001 From: Harsh Shandilya Date: Fri, 22 Dec 2017 19:37:02 +0530 Subject: [PATCH 782/804] UPSTREAM: android: binder: Use octal permissions 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit checkpatch warns against the use of symbolic permissions, this patch migrates all symbolic permissions in the binder driver to octal permissions. Test: debugfs nodes created by binder have the same unix permissions prior to and after this patch was applied. Signed-off-by: Harsh Shandilya Cc: "Arve Hjønnevåg" Cc: Todd Kjos Cc: Martijn Coenen Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 21d02ddf716669e182a13b69b4dd928cf8ef5e0f) Change-Id: I8152fe280ead1d04d89593e813a722f9eb5def27 --- drivers/android/binder.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 805df83963d4..98c00b57ad80 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -143,7 +143,7 @@ enum { }; static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; -module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); +module_param_named(debug_mask, binder_debug_mask, uint, 0644); static char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; module_param_named(devices, binder_devices_param, charp, S_IRUGO); @@ -162,7 +162,7 @@ static int binder_set_stop_on_user_error(const char *val, return ret; } module_param_call(stop_on_user_error, binder_set_stop_on_user_error, - param_get_int, &binder_stop_on_user_error, S_IWUSR | S_IRUGO); + param_get_int, &binder_stop_on_user_error, 0644); #define binder_debug(mask, x...) \ do { \ @@ -4966,7 +4966,7 @@ static int binder_open(struct inode *nodp, struct file *filp) * anyway print all contexts that a given PID has, so this * is not a problem. 
*/ - proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO, + proc->debugfs_entry = debugfs_create_file(strbuf, 0444, binder_debugfs_dir_entry_proc, (void *)(unsigned long)proc->pid, &binder_proc_fops); @@ -5798,27 +5798,27 @@ static int __init binder_init(void) if (binder_debugfs_dir_entry_root) { debugfs_create_file("state", - S_IRUGO, + 0444, binder_debugfs_dir_entry_root, NULL, &binder_state_fops); debugfs_create_file("stats", - S_IRUGO, + 0444, binder_debugfs_dir_entry_root, NULL, &binder_stats_fops); debugfs_create_file("transactions", - S_IRUGO, + 0444, binder_debugfs_dir_entry_root, NULL, &binder_transactions_fops); debugfs_create_file("transaction_log", - S_IRUGO, + 0444, binder_debugfs_dir_entry_root, &binder_transaction_log, &binder_transaction_log_fops); debugfs_create_file("failed_transaction_log", - S_IRUGO, + 0444, binder_debugfs_dir_entry_root, &binder_transaction_log_failed, &binder_transaction_log_fops); From 8cfa4a392450f64d57af5c3b99dae20bd7143f13 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 23 Jan 2018 12:04:27 -0600 Subject: [PATCH 783/804] UPSTREAM: android: binder: Use true and false for boolean values Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Cc: Todd Kjos Cc: Martijn Coenen Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 197410ad884eb18b31d48e9d8e64cb5a9e326f2f) Change-Id: I30bed831d6b6ff2e9e3e521ccc5d6836f0b30944 --- drivers/android/binder.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 98c00b57ad80..3ec97d41cf30 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -251,7 +251,7 @@ static struct binder_transaction_log_entry *binder_transaction_log_add( unsigned int cur = atomic_inc_return(&log->cur); if (cur >= ARRAY_SIZE(log->entry)) - log->full = 1; + log->full = true; e = &log->entry[cur % ARRAY_SIZE(log->entry)]; WRITE_ONCE(e->debug_id_done, 0); /* @@ -2805,7 +2805,7 @@ static bool binder_proc_transaction(struct binder_transaction *t, if (node->has_async_transaction) { pending_async = true; } else { - node->has_async_transaction = 1; + node->has_async_transaction = true; } } @@ -3670,7 +3670,7 @@ static int binder_thread_write(struct binder_proc *proc, w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); if (!w) { - buf_node->has_async_transaction = 0; + buf_node->has_async_transaction = false; } else { binder_enqueue_work_ilocked( w, &proc->todo); From 44343a7b30db28f6d140a23b46653d9fdd0a17c2 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Wed, 10 Jan 2018 10:49:05 +0800 Subject: [PATCH 784/804] UPSTREAM: android: binder: use VM_ALLOC to get vm area VM_IOREMAP is used to access hardware through a mechanism called I/O mapped memory. Android binder is an IPC mechanism which will not access I/O memory. And VM_IOREMAP has an alignment requirement which may not be needed in binder. __get_vm_area_node() { ... if (flags & VM_IOREMAP) align = 1ul << clamp_t(int, fls_long(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); ... } This patch will save some kernel vm area, especially for 32bit os. In 32bit OS, kernel vm area is only 240MB.
We may get the below error when launching an app: <3>[ 4482.440053] binder_alloc: binder_alloc_mmap_handler: 15728 8ce67000-8cf65000 get_vm_area failed -12 <3>[ 4483.218817] binder_alloc: binder_alloc_mmap_handler: 15745 8ce67000-8cf65000 get_vm_area failed -12 Signed-off-by: Ganesh Mahendran Acked-by: Martijn Coenen Acked-by: Todd Kjos Cc: stable ---- V3: update comments V2: update comments Signed-off-by: Greg Kroah-Hartman (cherry picked from commit aac6830ec1cb681544212838911cdc57f2638216) Change-Id: Ide458abc6a4d3ec07973733aa223c4247eef20e6 --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9f0204a44ba0..16d02de1f700 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -671,7 +671,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, goto err_already_mapped; } - area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP); + area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC); if (area == NULL) { ret = -ENOMEM; failure_string = "get_vm_area"; From 02d82286734d9195913adf42300523c41f1008ab Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Mar 2018 12:14:40 +0300 Subject: [PATCH 785/804] UPSTREAM: ANDROID: binder: re-order some conditions It doesn't make any difference to runtime but I've switched these two checks to make my static checker happy. The problem is that "buffer->data_size" is user controlled and if it's less than "sizeof(*hdr)" then that means "offset" can be more than "buffer->data_size". It's just cleaner to check it in the other order.
Signed-off-by: Dan Carpenter Acked-by: Martijn Coenen Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 361f2ddbb0c9f9b4f336025a7bd0212cea4a34f0) Change-Id: I098d525ba63d125caa9840e6e1d5004bf70edc3c --- drivers/android/binder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3ec97d41cf30..d7085625186a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2215,8 +2215,8 @@ static size_t binder_validate_object(struct binder_buffer *buffer, u64 offset) struct binder_object_header *hdr; size_t object_size = 0; - if (offset > buffer->data_size - sizeof(*hdr) || - buffer->data_size < sizeof(*hdr) || + if (buffer->data_size < sizeof(*hdr) || + offset > buffer->data_size - sizeof(*hdr) || !IS_ALIGNED(offset, sizeof(u32))) return 0; From 613e7993d78f5e06f2e56224f85d1068b98d7235 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 11 May 2018 01:45:24 -0700 Subject: [PATCH 786/804] UPSTREAM: ANDROID: binder: remove 32-bit binder interface. New devices launching with Android P need to use the 64-bit binder interface, even on 32-bit SoCs [0]. This change removes the Kconfig option to select the 32-bit binder interface. We don't think this will affect existing userspace for the following reasons: 1) The latest Android common tree is 4.14, so we don't believe any Android devices are on kernels >4.14. 2) Android devices launch on an LTS release and stick with it, so we wouldn't expect devices running on <= 4.14 now to upgrade to 4.17 or later. But even if they did, they'd rebuild the world (kernel + userspace) anyway. 3) Other userspaces like 'anbox' are already using the 64-bit interface. Note that this change doesn't remove the 32-bit UAPI itself; the reason for that is that Android userspace always uses the latest UAPI headers from upstream, and userspace retains 32-bit support for devices that are upgrading. 
This will be removed as well in 2-3 years, at which point we can remove the code from the UAPI as well. Finally, this change introduces build errors on archs where 64-bit get_user/put_user is not supported, so make binder unavailable on m68k (which wouldn't want it anyway). [0]: https://android-review.googlesource.com/c/platform/build/+/595193 Signed-off-by: Martijn Coenen Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 1190b4e38f97023154e6b3bef61b251aa5f970d0) Change-Id: I73dadf1d7b45a42bb18be5d5d3f5c090e61866de --- drivers/android/Kconfig | 15 +-------------- drivers/android/binder.c | 4 ---- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 01de42c8b74b..63ed9ceebf7b 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -9,7 +9,7 @@ if ANDROID config ANDROID_BINDER_IPC bool "Android Binder IPC Driver" - depends on MMU + depends on MMU && !M68K default n ---help--- Binder is used in Android for both communication between processes, @@ -31,19 +31,6 @@ config ANDROID_BINDER_DEVICES created. Each binder device has its own context manager, and is therefore logically separated from the other devices. -config ANDROID_BINDER_IPC_32BIT - bool - depends on !64BIT && ANDROID_BINDER_IPC - default y - ---help--- - The Binder API has been changed to support both 32 and 64bit - applications in a mixed environment. - - Enable this to support an old 32-bit Android user-space (v4.4 and - earlier). - - Note that enabling this will break newer Android user-space. 
- config ANDROID_BINDER_IPC_SELFTEST bool "Android Binder IPC Driver Selftest" depends on ANDROID_BINDER_IPC diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d7085625186a..d4fb60022d60 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -71,10 +71,6 @@ #include #include -#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT -#define BINDER_IPC_32BIT 1 -#endif - #include #include "binder_alloc.h" #include "binder_trace.h" From 157ecdc803759e9455b7ef27f3fad97361478837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E9=87=91=E6=97=B6?= Date: Thu, 10 May 2018 02:05:03 +0000 Subject: [PATCH 787/804] UPSTREAM: ANDROID: binder: correct the cmd print for BINDER_WORK_RETURN_ERROR By the time binder_stat_br is executed, e->cmd has already been changed to BR_OK, so the original return error cmd is lost. What we actually want to know is the original return error, such as BR_DEAD_REPLY or BR_FAILED_REPLY, etc., instead of always BR_OK. To avoid always reporting BR_OK, we need to assign the value of e->cmd to cmd before e->cmd = BR_OK.
Signed-off-by: songjinshi Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 838d5565669aa5bb7deb605684a5970d51d5eaf6) Change-Id: I425b32c5419a491c6b9ceee7c00dde6513e0421d --- drivers/android/binder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index d4fb60022d60..8d303b672e0e 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4094,6 +4094,7 @@ retry: binder_inner_proc_unlock(proc); if (put_user(e->cmd, (uint32_t __user *)ptr)) return -EFAULT; + cmd = e->cmd; e->cmd = BR_OK; ptr += sizeof(uint32_t); From e734b26701c1bab9b8a082a53f9b2cabf10051ee Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 May 2018 23:15:37 +0900 Subject: [PATCH 788/804] UPSTREAM: ANDROID: binder: change down_write to down_read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit binder_update_page_range needs down_write of mmap_sem because vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP unless it is set. However, when I profile binder working, it seems every binder buffers should be mapped in advance by binder_mmap. It means we could set VM_MIXEDMAP in binder_mmap time which is already hold a mmap_sem as down_write so binder_update_page_range doesn't need to hold a mmap_sem as down_write. Please use proper API down_read. It would help mmap_sem contention problem as well as fixing down_write abuse. Ganesh Mahendran tested app launching and binder throughput test and he said he couldn't find any problem and I did binder latency test per Greg KH request(Thanks Martijn to teach me how I can do) I cannot find any problem, too. 
Cc: Ganesh Mahendran Cc: Joe Perches Cc: Arve Hjønnevåg Cc: Todd Kjos Reviewed-by: Martijn Coenen Signed-off-by: Minchan Kim Reviewed-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 720c241924046aff83f5f2323232f34a30a4c281) Change-Id: I8358ceaaab4030f7122c95308dcad59557cad411 --- drivers/android/binder.c | 4 +++- drivers/android/binder_alloc.c | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 8d303b672e0e..eb61b0c2fddc 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4899,7 +4899,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) failure_string = "bad vm_flags"; goto err_bad_arg; } - vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; + vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP; + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 16d02de1f700..1d9db2ef26bd 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -220,7 +220,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, mm = alloc->vma_vm_mm; if (mm) { - down_write(&mm->mmap_sem); + down_read(&mm->mmap_sem); vma = alloc->vma; } @@ -289,7 +289,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, /* vm_insert_page does not seem to increment the refcount */ } if (mm) { - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); mmput(mm); } return 0; @@ -322,7 +322,7 @@ err_page_ptr_cleared: } err_no_vma: if (mm) { - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); mmput(mm); } return vma ? 
-ENOMEM : -ESRCH; From 96523f2450dc194ee8b914f9315fa6b609d172f3 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 15 Jun 2018 11:53:36 +0200 Subject: [PATCH 789/804] UPSTREAM: Revert "FROMLIST: binder: fix proc->files use-after-free" This reverts commit f09daf140e6e6d3b34e34382bc47a06b854b774e. Change-Id: I6d340f75e57e1badc5fe3f41e0aa8f148047c7bd --- drivers/android/binder.c | 63 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index eb61b0c2fddc..38e22aca89f6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -462,8 +462,9 @@ struct binder_ref { }; enum binder_deferred_state { - BINDER_DEFERRED_FLUSH = 0x01, - BINDER_DEFERRED_RELEASE = 0x02, + BINDER_DEFERRED_PUT_FILES = 0x01, + BINDER_DEFERRED_FLUSH = 0x02, + BINDER_DEFERRED_RELEASE = 0x04, }; /** @@ -500,6 +501,8 @@ struct binder_priority { * (invariant after initialized) * @tsk task_struct for group_leader of process * (invariant after initialized) + * @files files_struct for process + * (invariant after initialized) * @deferred_work_node: element for binder_deferred_list * (protected by binder_deferred_lock) * @deferred_work: bitmap of deferred work to perform @@ -544,6 +547,7 @@ struct binder_proc { struct list_head waiting_threads; int pid; struct task_struct *tsk; + struct files_struct *files; struct hlist_node deferred_work_node; int deferred_work; bool is_dead; @@ -938,34 +942,22 @@ static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); static void binder_inc_node_tmpref_ilocked(struct binder_node *node); -struct files_struct *binder_get_files_struct(struct binder_proc *proc) -{ - return get_files_struct(proc->tsk); -} - static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { - struct files_struct *files; + struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; 
- int ret; - files = binder_get_files_struct(proc); if (files == NULL) return -ESRCH; - if (!lock_task_sighand(proc->tsk, &irqs)) { - ret = -EMFILE; - goto err; - } + if (!lock_task_sighand(proc->tsk, &irqs)) + return -EMFILE; rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - ret = __alloc_fd(files, 0, rlim_cur, flags); -err: - put_files_struct(files); - return ret; + return __alloc_fd(files, 0, rlim_cur, flags); } /* @@ -974,12 +966,8 @@ err: static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { - struct files_struct *files = binder_get_files_struct(proc); - - if (files) { - __fd_install(files, fd, file); - put_files_struct(files); - } + if (proc->files) + __fd_install(proc->files, fd, file); } /* @@ -987,20 +975,18 @@ static void task_fd_install( */ static long task_close_fd(struct binder_proc *proc, unsigned int fd) { - struct files_struct *files = binder_get_files_struct(proc); int retval; - if (files == NULL) + if (proc->files == NULL) return -ESRCH; - retval = __close_fd(files, fd); + retval = __close_fd(proc->files, fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)) retval = -EINTR; - put_files_struct(files); return retval; } @@ -4863,6 +4849,7 @@ static void binder_vma_close(struct vm_area_struct *vma) (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, (unsigned long)pgprot_val(vma->vm_page_prot)); binder_alloc_vma_close(&proc->alloc); + binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -4906,8 +4893,10 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = proc; ret = binder_alloc_mmap_handler(&proc->alloc, vma); - - return ret; + if (ret) + return ret; + proc->files = get_files_struct(current); + 
return 0; err_bad_arg: pr_err("%s: %d %lx-%lx %s failed %d\n", __func__, @@ -5086,6 +5075,8 @@ static void binder_deferred_release(struct binder_proc *proc) struct rb_node *n; int threads, nodes, incoming_refs, outgoing_refs, active_transactions; + BUG_ON(proc->files); + mutex_lock(&binder_procs_lock); hlist_del(&proc->proc_node); mutex_unlock(&binder_procs_lock); @@ -5167,6 +5158,8 @@ static void binder_deferred_release(struct binder_proc *proc) static void binder_deferred_func(struct work_struct *work) { struct binder_proc *proc; + struct files_struct *files; + int defer; do { @@ -5183,11 +5176,21 @@ static void binder_deferred_func(struct work_struct *work) } mutex_unlock(&binder_deferred_lock); + files = NULL; + if (defer & BINDER_DEFERRED_PUT_FILES) { + files = proc->files; + if (files) + proc->files = NULL; + } + if (defer & BINDER_DEFERRED_FLUSH) binder_deferred_flush(proc); if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ + + if (files) + put_files_struct(files); } while (proc); } static DECLARE_WORK(binder_deferred_work, binder_deferred_func); From c88a3ec1ee64576a706432af3e8f1aee8e6e4d2e Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 27 Nov 2017 09:32:33 -0800 Subject: [PATCH 790/804] UPSTREAM: binder: fix proc->files use-after-free proc->files cleanup is initiated by binder_vma_close. Therefore a reference on the binder_proc is not enough to prevent the files_struct from being released while the binder_proc still has a reference. This can lead to an attempt to dereference the stale pointer obtained from proc->files prior to proc->files cleanup. This has been seen once in task_get_unused_fd_flags() when __alloc_fd() is called with a stale "files". The fix is to protect proc->files with a mutex to prevent cleanup while in use. 
Signed-off-by: Todd Kjos Cc: stable # 4.14 Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 7f3dc0088b98533f17128058fac73cd8b2752ef1) Change-Id: I40982bb0b4615bda5459538c20eb2a913964042c --- drivers/android/binder.c | 44 ++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 38e22aca89f6..ef7c58541c5c 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -502,7 +502,8 @@ struct binder_priority { * @tsk task_struct for group_leader of process * (invariant after initialized) * @files files_struct for process - * (invariant after initialized) + * (protected by @files_lock) + * @files_lock mutex to protect @files * @deferred_work_node: element for binder_deferred_list * (protected by binder_deferred_lock) * @deferred_work: bitmap of deferred work to perform @@ -548,6 +549,7 @@ struct binder_proc { int pid; struct task_struct *tsk; struct files_struct *files; + struct mutex files_lock; struct hlist_node deferred_work_node; int deferred_work; bool is_dead; @@ -944,20 +946,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node); static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) { - struct files_struct *files = proc->files; unsigned long rlim_cur; unsigned long irqs; + int ret; - if (files == NULL) - return -ESRCH; - - if (!lock_task_sighand(proc->tsk, &irqs)) - return -EMFILE; - + mutex_lock(&proc->files_lock); + if (proc->files == NULL) { + ret = -ESRCH; + goto err; + } + if (!lock_task_sighand(proc->tsk, &irqs)) { + ret = -EMFILE; + goto err; + } rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); unlock_task_sighand(proc->tsk, &irqs); - return __alloc_fd(files, 0, rlim_cur, flags); + ret = __alloc_fd(proc->files, 0, rlim_cur, flags); +err: + mutex_unlock(&proc->files_lock); + return ret; } /* @@ -966,8 +974,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) static 
void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { + mutex_lock(&proc->files_lock); if (proc->files) __fd_install(proc->files, fd, file); + mutex_unlock(&proc->files_lock); } /* @@ -977,9 +987,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) { int retval; - if (proc->files == NULL) - return -ESRCH; - + mutex_lock(&proc->files_lock); + if (proc->files == NULL) { + retval = -ESRCH; + goto err; + } retval = __close_fd(proc->files, fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || @@ -987,7 +999,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK)) retval = -EINTR; - +err: + mutex_unlock(&proc->files_lock); return retval; } @@ -4895,7 +4908,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) ret = binder_alloc_mmap_handler(&proc->alloc, vma); if (ret) return ret; + mutex_lock(&proc->files_lock); proc->files = get_files_struct(current); + mutex_unlock(&proc->files_lock); return 0; err_bad_arg: @@ -4919,6 +4934,7 @@ static int binder_open(struct inode *nodp, struct file *filp) spin_lock_init(&proc->outer_lock); get_task_struct(current->group_leader); proc->tsk = current->group_leader; + mutex_init(&proc->files_lock); INIT_LIST_HEAD(&proc->todo); if (binder_supported_policy(current->policy)) { proc->default_priority.sched_policy = current->policy; @@ -5178,9 +5194,11 @@ static void binder_deferred_func(struct work_struct *work) files = NULL; if (defer & BINDER_DEFERRED_PUT_FILES) { + mutex_lock(&proc->files_lock); files = proc->files; if (files) proc->files = NULL; + mutex_unlock(&proc->files_lock); } if (defer & BINDER_DEFERRED_FLUSH) From 8dd84f190eec55766eb3e6215af185b1a3cf59f4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 21 Aug 2017 16:13:28 +0200 Subject: [PATCH 791/804] UPSTREAM: binder: free memory on error On 
binder_init() the devices string is duplicated and smashed into individual device names which are passed along. However, the original duplicated string wasn't freed in case binder_init() failed. Let's free it on error. Signed-off-by: Christian Brauner Cc: stable Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 22eb9476b5d80a393ac0ba235c42bccc90b82c76) Change-Id: I78fdeecf70c31ba4248b3de17130f97546288f84 --- drivers/android/binder.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ef7c58541c5c..e159ab84547d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5797,7 +5797,7 @@ static int __init init_binder_device(const char *name) static int __init binder_init(void) { int ret; - char *device_name, *device_names; + char *device_name, *device_names, *device_tmp; struct binder_device *device; struct hlist_node *tmp; @@ -5855,7 +5855,8 @@ static int __init binder_init(void) } strcpy(device_names, binder_devices_param); - while ((device_name = strsep(&device_names, ","))) { + device_tmp = device_names; + while ((device_name = strsep(&device_tmp, ","))) { ret = init_binder_device(device_name); if (ret) goto err_init_binder_device_failed; @@ -5869,6 +5870,9 @@ err_init_binder_device_failed: hlist_del(&device->hlist); kfree(device); } + + kfree(device_names); + err_alloc_device_names_failed: debugfs_remove_recursive(binder_debugfs_dir_entry_root); From bda6b6e49b198870ddfd6cbbc88dda8271ddbbc5 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Wed, 7 Feb 2018 13:57:37 -0800 Subject: [PATCH 792/804] UPSTREAM: binder: replace "%p" with "%pK" The format specifier "%p" can leak kernel addresses. Use "%pK" instead. There were 4 remaining cases in binder.c. 
Signed-off-by: Todd Kjos Cc: stable Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 8ca86f1639ec5890d400fff9211aca22d0a392eb) Change-Id: I309241853c53bcdfa65c17cb05876e786597afdd --- drivers/android/binder.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e159ab84547d..e0c46ce312d7 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2351,7 +2351,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, int debug_id = buffer->debug_id; binder_debug(BINDER_DEBUG_TRANSACTION, - "%d buffer release %d, size %zd-%zd, failed at %p\n", + "%d buffer release %d, size %zd-%zd, failed at %pK\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, failed_at); @@ -3887,7 +3887,7 @@ static int binder_thread_write(struct binder_proc *proc, } } binder_debug(BINDER_DEBUG_DEAD_BINDER, - "%d:%d BC_DEAD_BINDER_DONE %016llx found %p\n", + "%d:%d BC_DEAD_BINDER_DONE %016llx found %pK\n", proc->pid, thread->pid, (u64)cookie, death); if (death == NULL) { @@ -5237,7 +5237,7 @@ static void print_binder_transaction_ilocked(struct seq_file *m, spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, - "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %d:%d r%d", + "%s %d: %pK from %d:%d to %d:%d code %x flags %x pri %d:%d r%d", prefix, t->debug_id, t, t->from ? t->from->proc->pid : 0, t->from ? 
t->from->pid : 0, @@ -5262,7 +5262,7 @@ static void print_binder_transaction_ilocked(struct seq_file *m, } if (buffer->target_node) seq_printf(m, " node %d", buffer->target_node->debug_id); - seq_printf(m, " size %zd:%zd data %p\n", + seq_printf(m, " size %zd:%zd data %pK\n", buffer->data_size, buffer->offsets_size, buffer->data); } From e95033171949d70bba6f3c582b063e3bb8620f86 Mon Sep 17 00:00:00 2001 From: Connor O'Brien Date: Fri, 13 Jul 2018 14:31:40 -0700 Subject: [PATCH 793/804] ANDROID: Reduce use of #ifdef CONFIG_CPU_FREQ_TIMES Add empty versions of functions to cpufreq_times.h to cut down on use of #ifdef in .c files. Test: kernel builds with and without CONFIG_CPU_FREQ_TIMES=y Change-Id: I49ac364fac3d42bba0ca1801e23b15081094fb12 Signed-off-by: Connor O'Brien --- include/linux/cpufreq_times.h | 4 ++++ kernel/exit.c | 3 +-- kernel/sched/core.c | 2 -- kernel/sched/cputime.c | 5 +---- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/cpufreq_times.h b/include/linux/cpufreq_times.h index 3fb38750c853..6374d4205a5f 100644 --- a/include/linux/cpufreq_times.h +++ b/include/linux/cpufreq_times.h @@ -31,6 +31,10 @@ void cpufreq_times_record_transition(struct cpufreq_freqs *freq); void cpufreq_task_times_remove_uids(uid_t uid_start, uid_t uid_end); int single_uid_time_in_state_open(struct inode *inode, struct file *file); #else +static inline void cpufreq_task_times_init(struct task_struct *p) {} +static inline void cpufreq_task_times_exit(struct task_struct *p) {} +static inline void cpufreq_acct_update_power(struct task_struct *p, + u64 cputime) {} static inline void cpufreq_times_create_policy(struct cpufreq_policy *policy) {} static inline void cpufreq_times_record_transition( struct cpufreq_freqs *freq) {} diff --git a/kernel/exit.c b/kernel/exit.c index e9bfee5fcce5..4479af833505 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -174,9 +174,8 @@ void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; 
-#ifdef CONFIG_CPU_FREQ_TIMES + cpufreq_task_times_exit(p); -#endif repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a0c1841993f..8f389b86bf34 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2198,9 +2198,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif -#ifdef CONFIG_CPU_FREQ_TIMES cpufreq_task_times_init(p); -#endif RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index c0763cba909d..fc2cfd6b2941 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -167,10 +167,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Account for user time used */ acct_account_cputime(p); -#ifdef CONFIG_CPU_FREQ_TIMES /* Account power usage for user time */ cpufreq_acct_update_power(p, cputime); -#endif } /* @@ -221,10 +219,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, /* Account for system time used */ acct_account_cputime(p); -#ifdef CONFIG_CPU_FREQ_TIMES + /* Account power usage for system time */ cpufreq_acct_update_power(p, cputime); -#endif } /* From 1b37d68f4c82b1c40cc16478de1f8f6a1f584a82 Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Sun, 3 Jun 2018 10:47:51 -0700 Subject: [PATCH 794/804] ANDROID: Fix massive cpufreq_times memory leaks Every time _cpu_up() is called for a CPU, idle_thread_get() is called which then re-initializes a CPU's idle thread that was already previously created and cached in a global variable in smpboot.c. idle_thread_get() calls init_idle() which then calls __sched_fork(). __sched_fork() is where cpufreq_task_times_init() is, and cpufreq_task_times_init() allocates memory for the task struct's time_in_state array. 
Since idle_thread_get() reuses a task struct instance that was already previously created, this means that every time it calls init_idle(), cpufreq_task_times_init() allocates this array again and overwrites the existing allocation that the idle thread already had. This causes memory to be leaked every time a CPU is onlined. In order to fix this, move allocation of time_in_state into _do_fork to avoid allocating it at all for idle threads. The cpufreq times interface is intended to be used for tracking userspace tasks, so we can safely remove it from the kernel's idle threads without killing any functionality. But that's not all! Task structs can be freed outside of release_task(), which creates another memory leak because a task struct can be freed without having its cpufreq times allocation freed. To fix this, free the cpufreq times allocation at the same time that task struct allocations are freed, in free_task(). Since free_task() can also be called in error paths of copy_process() after dup_task_struct(), set time_in_state to NULL immediately after calling dup_task_struct() to avoid possible double free. 
Bug description and fix adapted from patch submitted by Sultan Alsawaf at https://android-review.googlesource.com/c/kernel/msm/+/700134 Bug: 110044919 Test: Hikey960 builds, boots & reports /proc//time_in_state correctly Change-Id: I12fe7611fc88eb7f6c39f8f7629ad27b6ec4722c Signed-off-by: Connor O'Brien --- drivers/cpufreq/cpufreq_times.c | 9 ++++++--- include/linux/cpufreq_times.h | 2 ++ kernel/exit.c | 3 --- kernel/fork.c | 6 ++++++ kernel/sched/core.c | 2 -- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cpufreq_times.c b/drivers/cpufreq/cpufreq_times.c index e5df7a47cc16..e7a8b636a5f4 100644 --- a/drivers/cpufreq/cpufreq_times.c +++ b/drivers/cpufreq/cpufreq_times.c @@ -234,16 +234,19 @@ static int uid_time_in_state_seq_show(struct seq_file *m, void *v) void cpufreq_task_times_init(struct task_struct *p) { - void *temp; unsigned long flags; - unsigned int max_state; spin_lock_irqsave(&task_time_in_state_lock, flags); p->time_in_state = NULL; spin_unlock_irqrestore(&task_time_in_state_lock, flags); p->max_state = 0; +} - max_state = READ_ONCE(next_offset); +void cpufreq_task_times_alloc(struct task_struct *p) +{ + void *temp; + unsigned long flags; + unsigned int max_state = READ_ONCE(next_offset); /* We use one array to avoid multiple allocs per task */ temp = kcalloc(max_state, sizeof(p->time_in_state[0]), GFP_ATOMIC); diff --git a/include/linux/cpufreq_times.h b/include/linux/cpufreq_times.h index 6374d4205a5f..356a3fad03c9 100644 --- a/include/linux/cpufreq_times.h +++ b/include/linux/cpufreq_times.h @@ -22,6 +22,7 @@ #ifdef CONFIG_CPU_FREQ_TIMES void cpufreq_task_times_init(struct task_struct *p); +void cpufreq_task_times_alloc(struct task_struct *p); void cpufreq_task_times_exit(struct task_struct *p); int proc_time_in_state_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p); @@ -32,6 +33,7 @@ void cpufreq_task_times_remove_uids(uid_t uid_start, uid_t uid_end); int 
single_uid_time_in_state_open(struct inode *inode, struct file *file); #else static inline void cpufreq_task_times_init(struct task_struct *p) {} +static inline void cpufreq_task_times_alloc(struct task_struct *p) {} static inline void cpufreq_task_times_exit(struct task_struct *p) {} static inline void cpufreq_acct_update_power(struct task_struct *p, u64 cputime) {} diff --git a/kernel/exit.c b/kernel/exit.c index 4479af833505..e57bff761b88 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,7 +54,6 @@ #include #include #include -#include #include "sched/tune.h" @@ -174,8 +173,6 @@ void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; - - cpufreq_task_times_exit(p); repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ diff --git a/kernel/fork.c b/kernel/fork.c index a24b96015538..80445ca0420b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -226,6 +227,7 @@ static void account_kernel_stack(unsigned long *stack, int account) void free_task(struct task_struct *tsk) { + cpufreq_task_times_exit(tsk); account_kernel_stack(tsk->stack, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk->stack); @@ -1360,6 +1362,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!p) goto fork_out; + cpufreq_task_times_init(p); + ftrace_graph_init_task(p); rt_mutex_init_task(p); @@ -1791,6 +1795,8 @@ long _do_fork(unsigned long clone_flags, struct completion vfork; struct pid *pid; + cpufreq_task_times_alloc(p); + trace_sched_process_fork(current, p); pid = get_task_pid(p, PIDTYPE_PID); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f389b86bf34..0c1b195a2aaf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2198,8 +2198,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->se.statistics, 
0, sizeof(p->se.statistics)); #endif - cpufreq_task_times_init(p); - RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); __dl_clear_params(p); From 40ecc0aff22302ee76a724e7d3223ad8c75febe2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sun, 8 Apr 2018 23:35:28 +0200 Subject: [PATCH 795/804] Kbuild: fix # escaping in .cmd files for future Make MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9564a8cf422d7b58f6e857e3546d346fa970191e upstream. I tried building using a freshly built Make (4.2.1-69-g8a731d1), but already the objtool build broke with orc_dump.c: In function ‘orc_dump’: orc_dump.c:106:2: error: ‘elf_getshnum’ is deprecated [-Werror=deprecated-declarations] if (elf_getshdrnum(elf, &nr_sections)) { Turns out that with that new Make, the backslash was not removed, so cpp didn't see a #include directive, grep found nothing, and -DLIBELF_USE_DEPRECATED was wrongly put in CFLAGS. Now, that new Make behaviour is documented in their NEWS file: * WARNING: Backward-incompatibility! Number signs (#) appearing inside a macro reference or function invocation no longer introduce comments and should not be escaped with backslashes: thus a call such as: foo := $(shell echo '#') is legal. Previously the number sign needed to be escaped, for example: foo := $(shell echo '\#') Now this latter will resolve to "\#". If you want to write makefiles portable to both versions, assign the number sign to a variable: C := \# foo := $(shell echo '$C') This was claimed to be fixed in 3.81, but wasn't, for some reason. To detect this change search for 'nocomment' in the .FEATURES variable. This also fixes up the two make-cmd instances to replace # with $(pound) rather than with \#. There might very well be other places that need similar fixup in preparation for whatever future Make release contains the above change, but at least this builds an x86_64 defconfig with the new make. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=197847 Cc: Randy Dunlap Signed-off-by: Rasmus Villemoes Signed-off-by: Masahiro Yamada Cc: Konstantin Khlebnikov Signed-off-by: Greg Kroah-Hartman --- scripts/Kbuild.include | 5 +++-- tools/build/Build.include | 5 +++-- tools/scripts/Makefile.include | 2 ++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 1db6d73c8dd2..31a981d6229d 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -7,6 +7,7 @@ quote := " squote := ' empty := space := $(empty) $(empty) +pound := \# ### # Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o @@ -236,11 +237,11 @@ endif # Replace >$< with >$$< to preserve $ when reloading the .cmd file # (needed for make) -# Replace >#< with >\#< to avoid starting a comment in the .cmd file +# Replace >#< with >$(pound)< to avoid starting a comment in the .cmd file # (needed for make) # Replace >'< with >'\''< to be able to enclose the whole string in '...' # (needed for the shell) -make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1))))) +make-cmd = $(call escsq,$(subst $(pound),$$(pound),$(subst $$,$$$$,$(cmd_$(1))))) # Find any prerequisites that is newer than target or that does not exist. # PHONY targets skipped in both cases. diff --git a/tools/build/Build.include b/tools/build/Build.include index 1c570528baf7..0340d8a51dab 100644 --- a/tools/build/Build.include +++ b/tools/build/Build.include @@ -12,6 +12,7 @@ # Convenient variables comma := , squote := ' +pound := \# ### # Name of target with a '.' as filename prefix. 
foo/bar.o => foo/.bar.o @@ -43,11 +44,11 @@ echo-cmd = $(if $($(quiet)cmd_$(1)),\ ### # Replace >$< with >$$< to preserve $ when reloading the .cmd file # (needed for make) -# Replace >#< with >\#< to avoid starting a comment in the .cmd file +# Replace >#< with >$(pound)< to avoid starting a comment in the .cmd file # (needed for make) # Replace >'< with >'\''< to be able to enclose the whole string in '...' # (needed for the shell) -make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1))))) +make-cmd = $(call escsq,$(subst $(pound),$$(pound),$(subst $$,$$$$,$(cmd_$(1))))) ### # Find any prerequisites that is newer than target or that does not exist. diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index 19edc1a7a232..7ea4438b801d 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -92,3 +92,5 @@ ifneq ($(silent),1) QUIET_INSTALL = @printf ' INSTALL %s\n' $1; endif endif + +pound := \# From 654ee679e1a16eafc65c541f7dc71471b9608e04 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Dec 2016 10:14:42 -0800 Subject: [PATCH 796/804] x86/cpu: Probe CPUID leaf 6 even when cpuid_level == 6 commit 3df8d9208569ef0b2313e516566222d745f3b94b upstream. A typo (or mis-merge?) resulted in leaf 6 only being probed if cpuid_level >= 7. 
Fixes: 2ccd71f1b278 ("x86/cpufeature: Move some of the scattered feature bits to x86_capability") Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Brian Gerst Link: http://lkml.kernel.org/r/6ea30c0e9daec21e488b54761881a6dfcf3e04d0.1481825597.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: Brad Spengler Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 814276d0eed1..736e2843139b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -686,13 +686,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_1_EDX] = edx; } + /* Thermal and Power Management Leaf: level 0x00000006 (eax) */ + if (c->cpuid_level >= 0x00000006) + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + /* Additional Intel-defined flags: level 0x00000007 */ if (c->cpuid_level >= 0x00000007) { cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_7_0_EBX] = ebx; - - c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); c->x86_capability[CPUID_7_ECX] = ecx; } From 60904a578815a6f4abcc0eafadacc152589359fd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jul 2016 18:28:43 -0300 Subject: [PATCH 797/804] perf tools: Move syscall number fallbacks from perf-sys.h to tools/arch/x86/include/asm/ commit cec07f53c398f22576df77052c4777dc13f14962 upstream. And remove the empty tools/arch/x86/include/asm/unistd_{32,64}.h files introduced by eae7a755ee81 ("perf tools, x86: Build perf on older user-space as well"). This way we get closer to mirroring the kernel for cases where __NR_ can't be found for some include path/_GNU_SOURCE/whatever scenario. 
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-kpj6m3mbjw82kg6krk2z529e@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Cc: Konstantin Khlebnikov Signed-off-by: Greg Kroah-Hartman --- tools/arch/x86/include/asm/unistd_32.h | 9 +++++++++ tools/arch/x86/include/asm/unistd_64.h | 9 +++++++++ tools/perf/config/Makefile | 1 + tools/perf/perf-sys.h | 18 ------------------ tools/perf/util/include/asm/unistd_32.h | 1 - tools/perf/util/include/asm/unistd_64.h | 1 - 6 files changed, 19 insertions(+), 20 deletions(-) create mode 100644 tools/arch/x86/include/asm/unistd_32.h create mode 100644 tools/arch/x86/include/asm/unistd_64.h delete mode 100644 tools/perf/util/include/asm/unistd_32.h delete mode 100644 tools/perf/util/include/asm/unistd_64.h diff --git a/tools/arch/x86/include/asm/unistd_32.h b/tools/arch/x86/include/asm/unistd_32.h new file mode 100644 index 000000000000..cf33ab09273d --- /dev/null +++ b/tools/arch/x86/include/asm/unistd_32.h @@ -0,0 +1,9 @@ +#ifndef __NR_perf_event_open +# define __NR_perf_event_open 336 +#endif +#ifndef __NR_futex +# define __NR_futex 240 +#endif +#ifndef __NR_gettid +# define __NR_gettid 224 +#endif diff --git a/tools/arch/x86/include/asm/unistd_64.h b/tools/arch/x86/include/asm/unistd_64.h new file mode 100644 index 000000000000..2c9835695b56 --- /dev/null +++ b/tools/arch/x86/include/asm/unistd_64.h @@ -0,0 +1,9 @@ +#ifndef __NR_perf_event_open +# define __NR_perf_event_open 298 +#endif +#ifndef __NR_futex +# define __NR_futex 202 +#endif +#ifndef __NR_gettid +# define __NR_gettid 186 +#endif diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index de89ec574361..b92c952b01ef 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -200,6 +200,7 @@ CFLAGS += -I$(src-perf)/arch/$(ARCH)/include CFLAGS += -I$(srctree)/tools/include/ CFLAGS += -I$(srctree)/arch/$(ARCH)/include/uapi CFLAGS += -I$(srctree)/arch/$(ARCH)/include 
+CFLAGS += -I$(srctree)/tools/arch/$(ARCH)/include CFLAGS += -I$(srctree)/include/uapi CFLAGS += -I$(srctree)/include diff --git a/tools/perf/perf-sys.h b/tools/perf/perf-sys.h index 83a25cef82fd..5cee8a3d0455 100644 --- a/tools/perf/perf-sys.h +++ b/tools/perf/perf-sys.h @@ -11,29 +11,11 @@ #if defined(__i386__) #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #define CPUINFO_PROC {"model name"} -#ifndef __NR_perf_event_open -# define __NR_perf_event_open 336 -#endif -#ifndef __NR_futex -# define __NR_futex 240 -#endif -#ifndef __NR_gettid -# define __NR_gettid 224 -#endif #endif #if defined(__x86_64__) #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #define CPUINFO_PROC {"model name"} -#ifndef __NR_perf_event_open -# define __NR_perf_event_open 298 -#endif -#ifndef __NR_futex -# define __NR_futex 202 -#endif -#ifndef __NR_gettid -# define __NR_gettid 186 -#endif #endif #ifdef __powerpc__ diff --git a/tools/perf/util/include/asm/unistd_32.h b/tools/perf/util/include/asm/unistd_32.h deleted file mode 100644 index 8b137891791f..000000000000 --- a/tools/perf/util/include/asm/unistd_32.h +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tools/perf/util/include/asm/unistd_64.h b/tools/perf/util/include/asm/unistd_64.h deleted file mode 100644 index 8b137891791f..000000000000 --- a/tools/perf/util/include/asm/unistd_64.h +++ /dev/null @@ -1 +0,0 @@ - From ecb99897516f0bb433f3e79df3b1958c80ac4810 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 19 Jul 2018 15:35:58 +0200 Subject: [PATCH 798/804] Linux 4.4.142 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3fc39e41dbde..75d6176c8786 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 141 +SUBLEVEL = 142 EXTRAVERSION = NAME = Blurry Fish Butt From f402eb9ad587a3952a2180193fa62eeac4289bdc Mon Sep 17 00:00:00 2001 From: Alistair Strachan Date: Wed, 25 Jul 2018 16:11:38 -0700 Subject: [PATCH 799/804] 
x86_64_cuttlefish_defconfig: enable verity cert Bug: 72722987 Test: Build, boot and verify in /proc/keys Change-Id: Ia55b94d56827003a88cb6083a75340ee31347470 Signed-off-by: Alistair Strachan --- arch/x86/configs/x86_64_cuttlefish_defconfig | 5 ++++ verity_dev_keys.x509 | 24 ++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 verity_dev_keys.x509 diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 71026930c04c..7b63741c622d 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -447,3 +447,8 @@ CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_SHA512=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="verity_dev_keys.x509" diff --git a/verity_dev_keys.x509 b/verity_dev_keys.x509 new file mode 100644 index 000000000000..86399c3c1dd7 --- /dev/null +++ b/verity_dev_keys.x509 @@ -0,0 +1,24 @@ +-----BEGIN CERTIFICATE----- +MIID/TCCAuWgAwIBAgIJAJcPmDkJqolJMA0GCSqGSIb3DQEBBQUAMIGUMQswCQYD +VQQGEwJVUzETMBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNTW91bnRhaW4g +VmlldzEQMA4GA1UECgwHQW5kcm9pZDEQMA4GA1UECwwHQW5kcm9pZDEQMA4GA1UE +AwwHQW5kcm9pZDEiMCAGCSqGSIb3DQEJARYTYW5kcm9pZEBhbmRyb2lkLmNvbTAe +Fw0xNDExMDYxOTA3NDBaFw00MjAzMjQxOTA3NDBaMIGUMQswCQYDVQQGEwJVUzET +MBEGA1UECAwKQ2FsaWZvcm5pYTEWMBQGA1UEBwwNTW91bnRhaW4gVmlldzEQMA4G +A1UECgwHQW5kcm9pZDEQMA4GA1UECwwHQW5kcm9pZDEQMA4GA1UEAwwHQW5kcm9p +ZDEiMCAGCSqGSIb3DQEJARYTYW5kcm9pZEBhbmRyb2lkLmNvbTCCASIwDQYJKoZI +hvcNAQEBBQADggEPADCCAQoCggEBAOjreE0vTVSRenuzO9vnaWfk0eQzYab0gqpi +6xAzi6dmD+ugoEKJmbPiuE5Dwf21isZ9uhUUu0dQM46dK4ocKxMRrcnmGxydFn6o +fs3ODJMXOkv2gKXL/FdbEPdDbxzdu8z3yk+W67udM/fW7WbaQ3DO0knu+izKak/3 +T41c5uoXmQ81UNtAzRGzGchNVXMmWuTGOkg6U+0I2Td7K8yvUMWhAWPPpKLtVH9r 
+AL5TzjYNR92izdKcz3AjRsI3CTjtpiVABGeX0TcjRSuZB7K9EK56HV+OFNS6I1NP +jdD7FIShyGlqqZdUOkAUZYanbpgeT5N7QL6uuqcGpoTOkalu6kkCAwEAAaNQME4w +HQYDVR0OBBYEFH5DM/m7oArf4O3peeKO0ZIEkrQPMB8GA1UdIwQYMBaAFH5DM/m7 +oArf4O3peeKO0ZIEkrQPMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEFBQADggEB +AHO3NSvDE5jFvMehGGtS8BnFYdFKRIglDMc4niWSzhzOVYRH4WajxdtBWc5fx0ix +NF/+hVKVhP6AIOQa+++sk+HIi7RvioPPbhjcsVlZe7cUEGrLSSveGouQyc+j0+m6 +JF84kszIl5GGNMTnx0XRPO+g8t6h5LWfnVydgZfpGRRg+WHewk1U2HlvTjIceb0N +dcoJ8WKJAFWdcuE7VIm4w+vF/DYX/A2Oyzr2+QRhmYSv1cusgAeC1tvH4ap+J1Lg +UnOu5Kh/FqPLLSwNVQp4Bu7b9QFfqK8Moj84bj88NqRGZgDyqzuTrFxn6FW7dmyA +yttuAJAEAymk1mipd9+zp38= +-----END CERTIFICATE----- From 56b516c5e343439b0a33c797de1bed012d679fe0 Mon Sep 17 00:00:00 2001 From: Alistair Strachan Date: Wed, 25 Jul 2018 16:11:09 -0700 Subject: [PATCH 800/804] x86_64_cuttlefish_defconfig: Enable android-verity Bug: 72722987 Test: Build & boot with x86_64_cuttlefish_defconfig Change-Id: I961e6aaa944b5ab0c005cb39604a52f8dc98fb06 Signed-off-by: Alistair Strachan --- arch/x86/configs/x86_64_cuttlefish_defconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/configs/x86_64_cuttlefish_defconfig b/arch/x86/configs/x86_64_cuttlefish_defconfig index 7b63741c622d..df9b6bd228f7 100644 --- a/arch/x86/configs/x86_64_cuttlefish_defconfig +++ b/arch/x86/configs/x86_64_cuttlefish_defconfig @@ -214,13 +214,17 @@ CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_SPI_ATTRS=y CONFIG_SCSI_VIRTIO=y CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=y CONFIG_BLK_DEV_DM=y CONFIG_DM_CRYPT=y CONFIG_DM_MIRROR=y CONFIG_DM_ZERO=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE=1 CONFIG_DM_VERITY_FEC=y +CONFIG_DM_ANDROID_VERITY=y CONFIG_NETDEVICES=y CONFIG_NETCONSOLE=y CONFIG_NETCONSOLE_DYNAMIC=y From 9fa2a49a4ac4f105943fe244b0fa825beb1d1ba5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Jul 2018 12:08:54 +0200 Subject: [PATCH 801/804] tcp: avoid collapses in tcp_prune_queue() if possible [ Upstream commit 
f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 ] Right after a TCP flow is created, receiving tiny out of order packets always hit the condition : if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc (guarded by tcp_rmem[2]) Calling tcp_collapse_ofo_queue() in this case is not useful, and offers a O(N^2) surface attack to malicious peers. Better not attempt anything before full queue capacity is reached, forcing attacker to spend lots of resource and allow us to more easily detect the abuse. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 667a2573317f..c757a74b2916 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4866,6 +4866,9 @@ static int tcp_prune_queue(struct sock *sk) else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) tcp_collapse(sk, &sk->sk_receive_queue, From 792e682a471db356576473d90e9700071171d7e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Jul 2018 12:08:55 +0200 Subject: [PATCH 802/804] tcp: detect malicious patterns in tcp_collapse_ofo_queue() [ Upstream commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf ] In case an attacker feeds tiny packets completely out of order, tcp_collapse_ofo_queue() might scan the whole rb-tree, performing expensive copies, but not changing socket memory usage at all. 1) Do not attempt to collapse tiny skbs. 2) Add logic to exit early when too many tiny skbs are detected.
We prefer not doing aggressive collapsing (which copies packets) for pathological flows, and revert to tcp_prune_ofo_queue() which will be less expensive. In the future, we might add the possibility of terminating flows that are proven to be malicious. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c757a74b2916..cc9874d7f223 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4780,6 +4780,7 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 range_truesize, sum_tiny = 0; struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); struct sk_buff *head; u32 start, end; @@ -4789,6 +4790,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk) start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; + range_truesize = skb->truesize; head = skb; for (;;) { @@ -4803,8 +4805,17 @@ static void tcp_collapse_ofo_queue(struct sock *sk) if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, - head, skb, start, end); + /* Do not attempt collapsing tiny skbs */ + if (range_truesize != head->truesize || + end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { + tcp_collapse(sk, &tp->out_of_order_queue, + head, skb, start, end); + } else { + sum_tiny += range_truesize; + if (sum_tiny > sk->sk_rcvbuf >> 3) + return; + } + head = skb; if (!skb) break; From 7a77ef209cfb4e760c70e56bc5fdeaed8f34ed45 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 27 Jul 2018 13:58:53 +0200 Subject: [PATCH 803/804] Fix backport of "tcp: detect malicious patterns in tcp_collapse_ofo_queue()" Based on review from Eric Dumazet, my backport of commit 
3d4bf93ac12003f9b8e1e2de37fe27983deebdcf to older kernels was a bit incorrect. This patch fixes this. Reported-by: Eric Dumazet Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc9874d7f223..d3b7172b9ee0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4822,6 +4822,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk) /* Start new segment */ start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; + range_truesize = skb->truesize; } else { if (before(TCP_SKB_CB(skb)->seq, start)) start = TCP_SKB_CB(skb)->seq; From 9664bdeff388afe4749eee2332bc0cc220b6cee2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 26 Jul 2018 16:32:09 -0700 Subject: [PATCH 804/804] ANDROID: sdcardfs: Check stacked filesystem depth bug: 111860541 Change-Id: Ia0a30b2b8956c4ada28981584cd8647713a1e993 Signed-off-by: Daniel Rosenberg --- fs/sdcardfs/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 30e0c431a1ea..27ec726e7a46 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -295,6 +295,13 @@ static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb, atomic_inc(&lower_sb->s_active); sdcardfs_set_lower_super(sb, lower_sb); + sb->s_stack_depth = lower_sb->s_stack_depth + 1; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("sdcardfs: maximum fs stacking depth exceeded\n"); + err = -EINVAL; + goto out_sput; + } + /* inherit maxbytes from lower file system */ sb->s_maxbytes = lower_sb->s_maxbytes;