From e87f7329bbd6760c2acc4f1eb423362b08851a71 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Nov 2016 10:51:17 -0800 Subject: [PATCH 01/66] f2fs: fix overflow due to condition check order In the last ilen case, i was already increased, resulting in accessing out- of-boundary entry of do_replace and blkaddr. Fix to check ilen first to exit the loop. Fixes: 2aa8fbb9693020 ("f2fs: refactor __exchange_data_block for speed up") Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c7865073cd26..02372175725d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -967,7 +967,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { From 2411cf5befa5804e4ced4c45a3212d7653869286 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:34 +0800 Subject: [PATCH 02/66] f2fs: exclude free nids building and allocation During nid allocation, it needs to exclude building and allocating flow of free nids, this is because while building free nid cache, there are two steps: a) load free nids from unused nat entries in NAT pages, b) update free nid cache by checking nat journal. The two steps should be atomical, otherwise an used nid can be allocated as free one after a) and before b). This patch adds missing lock which covers build_free_nids in unlock_operation and f2fs_balance_fs_bg to avoid that. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 01177ecdeab8..f12de4589856 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1785,7 +1785,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1839,6 +1839,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1875,9 +1882,7 @@ retry: spin_unlock(&nm_i->free_nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); goto retry; } From 2dd15654ac0abe587a245a09a7823bbbd588bfb7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:00 +0800 Subject: [PATCH 03/66] f2fs: fix to release discard entries during checkpoint In f2fs_fill_super, if there is any IO error occurs during recovery, cached discard entries will be leaked, in order to avoid this, make write_checkpoint() handle memory release by itself, besides, move clear_prefree_segments to write_checkpoint for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 9 ++++++--- fs/f2fs/super.c | 1 - 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7e9b504bd8b2..7db2aa2280ad 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1184,7 +1184,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); @@ -1261,8 +1260,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - - f2fs_wait_all_discard_bio(sbi); + if (err) { + release_discard_addrs(sbi); + } else { + clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6132b4ce4e4c..bfa4414d707e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -738,7 +738,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); From 933439c8f3474e329709b715b43b0b8168bbecf8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:01 +0800 Subject: [PATCH 04/66] f2fs: give a chance to detach from dirty list If there is no dirty pages in inode, we should give a chance to detach the inode from global dirty list, otherwise it needs to call another unnecessary .writepages for detaching. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++--- fs/f2fs/dir.c | 1 + fs/f2fs/gc.c | 4 +++- fs/f2fs/inline.c | 4 +++- fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 4 +++- 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ae194fd2fdb..b2289dfab9a2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1785,12 +1785,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 369f4513be37..4df6168f43de 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6f14ee923acd..9c189171b89c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -673,8 +673,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5f1a67f756af..210b2dc79535 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -137,8 +137,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f12de4589856..e5236463781a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1204,6 +1204,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..f7e73b2afe68 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -274,8 +274,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); From 9434fcde1fa0f48e1a29fbdd9d436fa279aeb909 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:02 +0800 Subject: [PATCH 05/66] f2fs: add missing f2fs_balance_fs in f2fs_zero_range f2fs_balance_fs should be called in between node page updating, otherwise node page count will exceeded far beyond watermark of triggering foreground garbage collection, result in facing high risk of hitting LFS allocation failure. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 02372175725d..578bc7bcaa13 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1218,6 +1218,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; From 6f2d8ed654bfa391854df4de854953f772a16a9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:03 +0800 Subject: [PATCH 06/66] f2fs: don't miss any f2fs_balance_fs cases In f2fs_map_blocks, let f2fs_balance_fs detects node page modification with dn.node_changed to avoid miss some corner cases. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b2289dfab9a2..7fb081516f5f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -676,7 +676,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -735,10 +734,8 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -793,7 +790,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -812,9 +808,8 @@ skip: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -822,7 +817,7 @@ sync_out: unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); From 58736fa60f6ae659ac72da8b1580c308b47e8edd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:04 +0800 Subject: [PATCH 07/66] f2fs: be aware of extent beyond EOF in fiemap f2fs can support fallocating blocks beyond file size without changing the size, but ->fiemap of f2fs was restricted and can't detect these extents fallocated past EOF, now relieve the restriction. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7fb081516f5f..b544e0b2e7e5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -886,7 +886,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -903,13 +902,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -928,13 +920,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. - */ + flags |= FIEMAP_EXTENT_LAST; } From b691d98fdd4cc2514c60fd6975e6016da203e64f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:05 +0800 Subject: [PATCH 08/66] f2fs: fix to update largest extent under lock In order to avoid racing problem, make largest extent cache being updated under lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..1fbebcb33a9d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -252,6 +252,7 @@ retry: int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +268,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); From 9de69279750e9740bc7221c7051a40c0516a58fb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:06 +0800 Subject: [PATCH 09/66] f2fs: fix error handling in fsync_node_pages In fsync_node_pages, if f2fs was taged with CP_ERROR_FLAG, make sure bio cache was flushed before return. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e5236463781a..975909c146a1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1339,7 +1339,8 @@ retry: if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1412,7 +1413,7 @@ continue_unlock: unlock_page(last_page); goto retry; } - +out: if (nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; From 0c0b471e43e7acf0747c6eb410863bf78c14750d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 11 Oct 2016 10:36:12 -0700 Subject: [PATCH 10/66] f2fs: fix sparse warnings f2fs contained a number of endianness conversion bugs. Also, one function should have been 'static'. Found with sparse by running 'make C=2 CF=-D__CHECK_ENDIAN__ fs/f2fs/' Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 5 +++-- fs/f2fs/node.h | 2 +- fs/f2fs/segment.c | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4df6168f43de..9fd1b0e6d572 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 210b2dc79535..9cd3379581be 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -421,7 +421,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 975909c146a1..6f99907bc8d7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..cfdcf98516a1 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -313,7 +313,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f7e73b2afe68..5e1b4a9e2cd2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -635,7 +635,7 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio) } /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) { struct block_device *bdev = sbi->sb->s_bdev; From a11b9f65eae766b17ec3451a6a1766f0a9d1dbff Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:56:59 +0800 Subject: [PATCH 11/66] f2fs: clear nlink if fail to add_link We don't need to keep incomplete created inode in cache, so if we fail to add link into directory during new inode creation, it's better to set nlink of inode to zero, then we can evict inode immediately. Otherwise release of nid belong to inode will be delayed until inode cache is being shrunk, it may cause a seemingly endless loop while allocating free nids in time of testing generic/269 case of fstest suit. Signed-off-by: Chao Yu [Jaegeuk Kim: add update_inode_page to fix kernel panic] Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1fbebcb33a9d..d32fd0343eae 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -387,6 +387,8 @@ retry: f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -427,6 +429,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); From b8559dc242d1d47dcf99660a4d6afded727e0cc0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Oct 2016 19:28:29 +0800 Subject: [PATCH 12/66] f2fs: split free nid list During free nid allocation, in order to do preallocation, we will tag free nid entry as allocated one and still leave it in free nid list, for other allocators who want to grab free nids, it needs to traverse the free nid list for lookup. It becomes overhead in scenario of allocating free nid intensively by multithreads. This patch splits free nid list to two list: {free,alloc}_nid_list, to keep free nids and preallocated free nids separately, after that, traverse latency will be gone, besides split nid_cnt for separate statistic. Additionally, introduce __insert_nid_to_list and __remove_nid_from_list for cleanup. Signed-off-by: Chao Yu [Jaegeuk Kim: modify f2fs_bug_on to avoid needless branches] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 ++-- fs/f2fs/f2fs.h | 14 +++-- fs/f2fs/node.c | 136 +++++++++++++++++++++++++++------------------ fs/f2fs/node.h | 11 ++-- fs/f2fs/shrinker.c | 4 +- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fb245bd302e4..6af146c48644 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -74,7 +74,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +195,9 @@ get_cache: si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -324,8 +327,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8de18a168a..86b543c3defb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -503,6 +503,12 @@ static inline void __try_update_largest_extent(struct inode *inode, } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -522,9 +528,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -2184,7 +2190,7 @@ struct f2fs_stat_info { s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 6f99907bc8d7..3019fe2e9df7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -1698,10 +1698,31 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, struct free_nid *i) { - list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); } +static void __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; + list_del(&i->list); +} + static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1732,33 +1753,33 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, FREE_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1797,7 +1818,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ @@ -1833,7 +1854,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1866,23 +1887,22 @@ retry: if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ build_free_nids(sbi); @@ -1897,11 +1917,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1918,17 +1939,20 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); + if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; } else { i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST); } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1940,24 +1964,26 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2013,7 +2039,7 @@ recover_xnid: if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2043,7 +2069,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2277,20 +2303,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2335,17 +2363,19 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index cfdcf98516a1..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..ec539f407cc4 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -26,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; + if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) + return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; return 0; } From eb0aa4b80784b8551bd5be577024e067bc83ef94 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 10:09:59 -0700 Subject: [PATCH 13/66] f2fs: clean up free nid list operations This patch cleans up to use consistent free nid list ops. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 56 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3019fe2e9df7..f21e5bba9483 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1695,25 +1695,26 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) -{ - radix_tree_delete(&nm_i->free_nid_root, i->nid); -} - -static void __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : i->state != NID_ALLOC); nm_i->nid_cnt[list]++; list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; } static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) + struct free_nid *i, enum nid_list list, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1721,6 +1722,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, i->state != NID_ALLOC); nm_i->nid_cnt[list]--; list_del(&i->list); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1728,6 +1731,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; + int err; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1754,15 +1758,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) } spin_lock(&nm_i->nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - __insert_nid_to_list(sbi, i, FREE_NID_LIST); - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); return 1; } @@ -1775,8 +1777,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1896,9 +1897,9 @@ retry: struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1920,8 +1921,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -1943,14 +1943,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST); + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } spin_unlock(&nm_i->nid_list_lock); @@ -1977,9 +1976,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); - + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2366,8 +2363,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From 3a2ad5672bb36ee9c07bab97dadc8b0f70d391f4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:35 +0800 Subject: [PATCH 14/66] f2fs: don't interrupt free nids building during nid allocation Let build_free_nids support sync/async methods, in allocation flow of nids, we use synchronuous method, so that we can avoid looping in alloc_nid when free memory is low; in unblock_operations and f2fs_balance_fs_bg we use asynchronuous method in where low memory condition can interrupt us. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 22 ++++++++++------------ fs/f2fs/segment.c | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7db2aa2280ad..654f5d7f9288 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -987,7 +987,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 86b543c3defb..692f08a53a3b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2040,7 +2040,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f21e5bba9483..b6894ef3d907 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1733,9 +1733,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct nat_entry *ne; int err; - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; - /* 0 nid should not be used */ if (unlikely(nid == 0)) return 0; @@ -1803,14 +1800,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void __build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1822,6 +1817,9 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; + if (!sync && !available_free_memory(sbi, FREE_NIDS)) + return; + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1864,10 +1862,10 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi); + __build_free_nids(sbi, sync); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1906,7 +1904,7 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi); + build_free_nids(sbi, true); goto retry; } @@ -2343,7 +2341,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5e1b4a9e2cd2..de597ccdd107 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -382,7 +382,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || From 02110a4fd53164db7cce3bb2780dce4d6c4e058f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:36 +0800 Subject: [PATCH 15/66] f2fs: avoid casted negative value as shrink count This patch makes sure it returns a positive value instead of a probable casted negative value as shrink count. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/shrinker.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index ec539f407cc4..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) - return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) From b9610bdfcbdbb6017802ec6d1e073f445c98157d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:28:05 -0700 Subject: [PATCH 16/66] f2fs: count dirty inodes to flush node pages during checkpoint If there are a lot of dirty inodes, we need to flush all of them when doing checkpoint. So, we need to count this for enough free space. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..762743988426 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) From 15d04354555fdfe8005e1365009e349148fb5f90 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:30:31 -0700 Subject: [PATCH 17/66] f2fs: call f2fs_balance_fs for setattr If inode becomes dirty, we need to check the # of dirty inodes whether or not further checkpoint would be required. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 578bc7bcaa13..5e3e1674d4e8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -694,7 +694,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -723,6 +722,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } f2fs_mark_inode_dirty_sync(inode); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } From 3e7b5bbbef7f5eb8a19aa61b611c704bf8230937 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2016 15:36:31 -0700 Subject: [PATCH 18/66] f2fs: declare static function for __build_free_nids This patch avoids build warning. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b6894ef3d907..74125b13d1af 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1805,7 +1805,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); From 664ba972df9b96942191db3068274cc1db899774 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Oct 2016 11:07:45 -0700 Subject: [PATCH 19/66] f2fs: use BIO_MAX_PAGES for bio allocation We don't need to allocate bio partially in order to maximize sequential writes. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 +--- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 17 +++-------------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 654f5d7f9288..157b7fdc2c67 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -228,7 +228,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b544e0b2e7e5..68edb47f5f71 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -277,10 +277,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 74125b13d1af..38cc07d49ff3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2098,7 +2098,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2107,7 +2106,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index de597ccdd107..dcd85d33b91d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2317,10 +2317,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 762743988426..89ab4301ef02 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -102,8 +102,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -696,13 +694,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. @@ -720,7 +711,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -737,11 +728,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; From 7c45729a4d6d1c90879e6c5c2df325c2f6db7191 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 11:51:23 -0700 Subject: [PATCH 20/66] f2fs: keep dirty inodes selectively for checkpoint This is to avoid no free segment bug during checkpoint caused by a number of dirty inodes. The case was reported by Chao like this. 1. mount with lazytime option 2. fill 4k file until disk is full 3. sync filesystem 4. read all files in the image 5. umount In this case, we actually don't need to flush dirty inode to inode page during checkpoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 6 +++--- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 29 ++++++++++++++++------------- fs/f2fs/xattr.c | 4 ++-- 10 files changed, 49 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 6fe23af509e1..8f487692c21f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9fd1b0e6d572..5f5678ecb445 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 692f08a53a3b..e6d057ceaf76 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -493,13 +493,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1627,7 +1627,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1654,7 +1654,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1663,7 +1663,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1674,7 +1674,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1688,7 +1688,7 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1703,19 +1703,19 @@ static inline bool f2fs_skip_inode_update(struct inode *inode) static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1843,13 +1843,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline int f2fs_readonly(struct super_block *sb) @@ -2001,7 +2001,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5e3e1674d4e8..67a6b5ddc251 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -264,7 +264,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -632,7 +632,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -721,7 +721,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* update attributes only */ + f2fs_mark_inode_dirty_sync(inode, false); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1399,7 +1400,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 9cd3379581be..8cab5df539c6 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -575,7 +575,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d32fd0343eae..bfa512dde4ab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 489fa0d5f914..db33b5631dc8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -778,7 +778,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -938,7 +938,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -953,7 +953,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bfa4414d707e..f8e2f319655d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -620,24 +620,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +650,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +679,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3e1c0280f866..c47ce2f330a1 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -554,7 +554,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: From 18340edc8da20b0d399eb25ba4bb631b27652f46 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Oct 2016 18:27:56 -0700 Subject: [PATCH 21/66] f2fs: make clean inodes when flushing inode page This patch tries to make more clean inodes when flushing dirty inodes in checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/inode.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 157b7fdc2c67..e5677e4a69d6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -921,7 +921,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bfa512dde4ab..7b5e402f0a72 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -339,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } From 35782b233f37e48ecc469d9c7232f3f6a7fad41a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Oct 2016 19:09:57 -0700 Subject: [PATCH 22/66] f2fs: remove percpu_count due to performance regression This patch removes percpu_count usage due to performance regression in iozone. Fixes: 523be8a6b3 ("f2fs: use percpu_counter for page counters") Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 ++++++------ fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/super.c | 16 +++++----------- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6af146c48644..2fdf23398fa1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -313,17 +313,17 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e6d057ceaf76..8bfbb546ebb2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -824,7 +824,7 @@ struct f2fs_sb_info { atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -1238,7 +1238,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) return; @@ -1255,7 +1255,7 @@ static inline void inode_inc_dirty_pages(struct inode *inode) static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1271,7 +1271,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } static inline s64 get_dirty_pages(struct inode *inode) @@ -2187,8 +2187,8 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f8e2f319655d..9749758f9977 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -696,10 +696,6 @@ static void f2fs_destroy_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } @@ -1449,6 +1445,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1473,6 +1470,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1488,13 +1488,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) From 230436b3ef3fd7d4a1da19edf5e87bb2d74e0fc2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Nov 2016 14:52:15 +0100 Subject: [PATCH 23/66] f2fs: hide a maybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc is unsure about the use of last_ofs_in_node, which might happen without a prior initialization: fs/f2fs//git/arm-soc/fs/f2fs/data.c: In function ‘f2fs_map_blocks’: fs/f2fs/data.c:799:54: warning: ‘last_ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { As pointed out by Chao Yu, the code is actually correct as 'prealloc' is only set if the last_ofs_in_node has been set, the two always get updated together. This initializes last_ofs_in_node to dn.ofs_in_node for each new dnode at the start of the 'next_block' loop, which at that point is a correct initialization as well. I assume that compilers that correctly track the contents of the variables and do not warn about the condition also figure out that they can eliminate the extra assignment here. Fixes: 46008c6d4232 ("f2fs: support in batch multi blocks preallocation") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 68edb47f5f71..cf5ec39f907d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -713,7 +713,7 @@ next_dnode: } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: From ed6bd4b146527e7c6934e3582c47d7b857802676 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Oct 2016 18:46:34 +0800 Subject: [PATCH 24/66] f2fs: report error of f2fs_fill_dentries Report error of f2fs_fill_dentries to ->iterate_shared, otherwise when error ocurrs, user may just list part of dirents in target directory without any hints. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 21 ++++++++++++--------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 6 ++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5f5678ecb445..827c5daef4fc 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -785,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -820,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -828,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -872,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -892,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8bfbb546ebb2..b569d14052ce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1961,7 +1961,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 8cab5df539c6..c486ec935a30 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -612,6 +612,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -624,11 +625,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, From 099228000eff6b25e0f76b276043cd65cd4eba5a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 31 Oct 2016 14:01:41 -0700 Subject: [PATCH 25/66] f2fs: avoid infinite loop in the EIO case on recover_orphan_inodes This patch should fix an infinite loop case below. F2FS-fs : inject IO error in f2fs_read_end_io+0xf3/0x120 [f2fs] F2FS-fs (nvme0n1p1): recover_orphan_inode: orphan failed (ino=39ac1a), run fsck to fix. ... [] sync_meta_pages+0xae/0x270 [f2fs] [] ? flush_sit_entries+0x8d/0x960 [f2fs] [] write_checkpoint+0x361/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x320 [f2fs] [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xe0e/0x1300 [f2fs] [] ? sget_userns+0x4f4/0x530 [] mount_bdev+0x182/0x1b0 [] ? f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 [] ? copy_mount_options+0xb7/0x220 [] SyS_mount+0x94/0xd0 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9749758f9977..72603f1d8c2e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1889,6 +1889,13 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: From 487df616dec33231c99294b906d720d256a2de16 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:44:59 +0900 Subject: [PATCH 26/66] f2fs: Add missing break in switch-case Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 72603f1d8c2e..82af510b3bcc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -420,6 +420,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_nodiscard: clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; From 0bfd7a091c19132489a0f977b8dbf9f6b5ae0a1c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:00 +0900 Subject: [PATCH 27/66] f2fs: Use generic zoned block device terminology SMR stands for "Shingled Magnetic Recording" which makes sense only for hard disk drives (spinning rust). The ZBC/ZAC standards enable management of SMR disks, but solid state drives may also support those standards. So rename the HMSMR feature to BLKZONED to avoid a HDD centric terminology. For the same reason, rename f2fs_sb_mounted_hmsmr to f2fs_sb_mounted_blkzoned. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cf5ec39f907d..47ded0c34df1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -110,7 +110,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, { if (!is_read_io(bio_op(bio))) { atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b569d14052ce..47e7a97bdc6c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -2418,9 +2418,9 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 82af510b3bcc..cf54fd89badd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -973,7 +973,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { From d1b959c8770260b611b9a1f0c5e8b12b7cb5b9d2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:01 +0900 Subject: [PATCH 28/66] f2fs: Check zoned block feature for host-managed zoned block devices The F2FS_FEATURE_BLKZONED feature indicates that the drive was formatted with zone alignment optimization. This is optional for host-aware devices, but mandatory for host-managed zoned block devices. So check that the feature is set in this latter case. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cf54fd89badd..ea3752f1668d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1638,6 +1638,26 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#else + if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + goto free_sb_buf; + } +#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); From 0ab0299835738cd407569401da1fef4c97b4419c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:02 +0900 Subject: [PATCH 29/66] f2fs: Suppress discard warning message for zoned block devices For zoned block devices, discard is replaced by zone reset. So do not warn if the device does not supports discard. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ea3752f1668d..56b0d2a5a590 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,7 +412,7 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); From 96ba2decb4241aa2c6b61cfc8489d648769eff99 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:03 +0900 Subject: [PATCH 30/66] f2fs: Always enable discard for zoned blocks devices Zone write pointer reset acts as discard for zoned block devices. So if the zoned block device feature is enabled, always declare that discard is enabled, even if the device does not actually support the command. For the same reason, prevent the use the "nodicard" mount option. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 +++++++------- fs/f2fs/super.c | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 47e7a97bdc6c..707beab1afec 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1111,13 +1111,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2423,6 +2416,13 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); +} + static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) { clear_opt(sbi, ADAPTIVE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 56b0d2a5a590..7bb0b36ae5e6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -419,6 +419,11 @@ static int parse_options(struct super_block *sb, char *options) } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); break; case Opt_noheap: From 3adc57e97792e4ac9f228bde802829e2e9840afe Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:04 +0900 Subject: [PATCH 31/66] f2fs: Do not allow adaptive mode for host-managed zoned block devices The LFS mode is mandatory for host-managed zoned block devices as update in place optimizations are not possible for segments in sequential zones. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7bb0b36ae5e6..3312284aa245 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -518,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { From 178053e2f1f9ccdb61ff6c2bd8644b53fc98e72e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:05 +0900 Subject: [PATCH 32/66] f2fs: Cache zoned block devices zone type With the zoned block device feature enabled, section discard need to do a zone reset for sections contained in sequential zones, and a regular discard (if supported) for sections stored in conventional zones. Avoid the need for a costly report zones to obtain a section zone type when discarding it by caching the types of the device zones in the super block information. This cache is initialized at mount time for mounts with the zoned block device feature enabled. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 18 +++++++++++++ fs/f2fs/super.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 707beab1afec..a6e2ede94b08 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -756,6 +756,14 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ + u8 *blkz_type; /* Array of zones type */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -2416,6 +2424,16 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + + return sbi->blkz_type[zno]; +} +#endif + static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3312284aa245..12a4f3f44a2f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1511,6 +1511,65 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_size(bdev) - 1)) + sbi->nr_blkz++; + + sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); + if (!sbi->blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + sbi->blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1757,6 +1816,15 @@ try_onemore: init_ino_entry_info(sbi); +#ifdef CONFIG_BLK_DEV_ZONED + err = init_blkz_info(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + goto free_blkz; + } +#endif + /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -1935,6 +2003,10 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +#ifdef CONFIG_BLK_DEV_ZONED +free_blkz: + kfree(sbi->blkz_type); +#endif kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); From f46e8809e88d113ba096bcfc772b5182ce00b941 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:06 +0900 Subject: [PATCH 33/66] f2fs: Reset sequential zones on zoned block devices When a zoned block device is mounted, discarding sections contained in sequential zones must reset the zone write pointer. For sections contained in conventional zones, the regular discard is used if the drive supports it. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 63 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dcd85d33b91d..b7b40bad080a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -635,15 +636,19 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio) } /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) { struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio = NULL; int err; - err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); if (!err && bio) { struct bio_entry *be = __add_bio_entry(sbi, bio); @@ -656,11 +661,49 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, return err; } +#ifdef CONFIG_BLK_DEV_ZONED +static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t sector = SECTOR_FROM_BLOCK(blkstart); + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + struct block_device *bdev = sbi->sb->s_bdev; + + if (nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Unaligned discard attempted (sector %llu + %llu)", + (unsigned long long)sector, + (unsigned long long)nr_sects); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return __f2fs_issue_discard_async(sbi, blkstart, + blklen); + + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); struct seg_entry *se; unsigned int offset; block_t i; @@ -672,8 +715,12 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); - return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb)) + return f2fs_issue_discard_zone(sbi, blkstart, blklen); +#endif + return __f2fs_issue_discard_async(sbi, blkstart, blklen); } static void __add_discard_entry(struct f2fs_sb_info *sbi, From 126606c7a99b32ba8265a51fab01533fe40c9ecc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:07 +0900 Subject: [PATCH 34/66] f2fs: Trace reset zone events Similarly to the regular discard, trace zone reset events. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + include/trace/events/f2fs.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b7b40bad080a..5e7e311758f8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -692,6 +692,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 903a09165bb1..a2193e35bc58 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1113,6 +1113,27 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +TRACE_EVENT(f2fs_issue_reset_zone, + + TP_PROTO(struct super_block *sb, block_t blkstart), + + TP_ARGS(sb, blkstart), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blkstart = blkstart; + ), + + TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", + show_dev(__entry), + (unsigned long long)__entry->blkstart) +); + TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct super_block *sb, unsigned int nobarrier, From 60dcedc9972d136fab1600c919dd32080bcc26f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Nov 2016 20:43:21 +0800 Subject: [PATCH 35/66] f2fs: record inode updating status correctly We should record updating status of inode only for living inode, for those unlinked inode it needs to clear its ino cache, otherwise after the ino was been reused, it will cause unneeded node page writing during ->fsync. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7b5e402f0a72..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -377,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -409,10 +412,12 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); From 9f0552e078b8a25fcf9c3950a05ea1398616d2b9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Nov 2016 00:26:55 +0800 Subject: [PATCH 36/66] f2fs: fix wrong i_atime recovery Shouldn't update in-memory i_atime with on-disk i_mtime of inode when recovering inode. Shuoran found this bug which is hidden for a long time, honour is belong to him. Signed-off-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..d2ba4da08ec3 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,10 +180,10 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); From bdb7d964c4fb73078ab21c20e8c7b7bf7e641bb6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:33:57 -0700 Subject: [PATCH 37/66] f2fs: assign segments correctly for direct_io Previously, we assigned CURSEG_WARM_DATA for direct_io, but if we have two or four logs, we do not use that type at all. Let's fix it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5e7e311758f8..7fb7dd3d1e74 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1501,8 +1501,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg; bool direct_io = (type == CURSEG_DIRECT_IO); - type = direct_io ? CURSEG_WARM_DATA : type; - + if (direct_io) { + if (sbi->active_logs <= 4) + type = CURSEG_HOT_DATA; + else + type = CURSEG_WARM_DATA; + } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From b4b9d34c855ef383402cd1acefb1e33a515ae5f5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:59:15 -0700 Subject: [PATCH 38/66] f2fs: remove checkpoint in f2fs_freeze The generic freeze_super() calls sync_filesystems() before f2fs_freeze(). So, basically we don't need to do checkpoint in f2fs_freeze(). But, in xfs/068, it triggers circular locking problem below due to gc_mutex for checkpoint. ====================================================== [ INFO: possible circular locking dependency detected ] 4.9.0-rc1+ #132 Tainted: G OE ------------------------------------------------------- 1. wait for __sb_start_write() by [] dump_stack+0x85/0xc2 [] print_circular_bug+0x1cf/0x230 [] __lock_acquire+0x19e0/0x1bc0 [] lock_acquire+0x11b/0x220 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] __sb_start_write+0x130/0x200 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] f2fs_drop_inode+0x9b/0x160 [f2fs] [] iput+0x171/0x2c0 [] f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] [] block_operations+0x84/0x110 [f2fs] [] write_checkpoint+0xe8/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? sched_clock+0x9/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] ? do_fsync+0x70/0x70 [] ? do_fsync+0x70/0x70 [] sync_fs_one_sb+0x20/0x30 [] iterate_supers+0xae/0x100 [] sys_sync+0x55/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 2. wait for sbi->gc_mutex by [] lock_acquire+0x11b/0x220 [] mutex_lock_nested+0x76/0x3f0 [] f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_freeze+0x1c/0x20 [f2fs] [] freeze_super+0xcf/0x190 [] do_vfs_ioctl+0x53c/0x6a0 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 12a4f3f44a2f..eca9aea9834e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -800,13 +800,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -2152,3 +2156,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + From d47b8715953ad0cf82bb0a9d30d7b11d83bc9c11 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 5 Nov 2016 11:12:40 +0800 Subject: [PATCH 39/66] Revert "f2fs: do not recover from previous remained wrong dnodes" i_times of inode will be set with current system time which can be configured through 'date', so it's not safe to judge dnode block as garbage data or unchanged inode depend on i_times. Now, we have used enhanced 'cp_ver + cp' crc method to verify valid dnode block, so I expect recoverying invalid dnode is almost not possible. This reverts commit 807b1e1c8e08452948495b1a9985ab46d329e5c2. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d2ba4da08ec3..62523b217571 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -196,32 +196,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) From 206147112860418fa9363cf94ec2c172bdf4313a Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 7 Nov 2016 21:22:31 +0800 Subject: [PATCH 40/66] f2fs: return directly if block has been removed from the victim If one block has been to written to a new place, just return in move data process. This patch check it again with holding page lock. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9c189171b89c..a7b2c13404f8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -544,7 +544,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -565,6 +566,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -645,7 +649,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -653,6 +658,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -796,9 +804,9 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); From 6ae1be13e85f4c42c8ca371fda50ae39eebbfd96 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:31:40 -0800 Subject: [PATCH 41/66] f2fs: revert segment allocation for direct IO Now we don't need to be too much careful about storage alignment for dio, since its speed becomes quite fast and we'd better avoid any misalignment first. Revert: 38aa0889b250 (f2fs: align direct_io'ed data to section) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +----- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 36 +++++++++--------------------------- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 47ded0c34df1..ca53da568633 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -588,7 +588,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -606,11 +605,8 @@ alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a6e2ede94b08..0496e82834d0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -591,7 +591,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7fb7dd3d1e74..4cc589a493e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1346,25 +1346,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1498,25 +1494,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - if (direct_io) { - if (sbi->active_logs <= 4) - type = CURSEG_HOT_DATA; - else - type = CURSEG_WARM_DATA; - } - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* From e57e9ae5b179a6b243c42bf6d9549d1595c27089 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:08:22 -0800 Subject: [PATCH 42/66] f2fs: allow dio read for LFS mode We can allow dio reads for LFS mode, while doing buffered writes for dio writes. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ca53da568633..996b9aef94ce 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1732,7 +1732,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); From 3c62be17d4f562f43fe1d03b48194399caa35aa5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Oct 2016 19:02:05 -0700 Subject: [PATCH 43/66] f2fs: support multiple devices This patch implements multiple devices support for f2fs. Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big volume under one f2fs instance. Internal block management is very simple, but we will modify block allocation and background GC policy to boost IO speed by exploiting them accoording to each device speed. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 ++++++++++++++-- fs/f2fs/f2fs.h | 29 +++++++-- fs/f2fs/segment.c | 119 ++++++++++++++++++++++++---------- fs/f2fs/super.c | 138 +++++++++++++++++++++++++++++++--------- include/linux/f2fs_fs.h | 10 ++- 5 files changed, 277 insertions(+), 74 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 996b9aef94ce..861173f00459 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -87,6 +87,46 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +/* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + /* * Low-level block read/write IO operations. */ @@ -97,8 +137,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; @@ -273,7 +312,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags))) + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -961,7 +1001,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -979,8 +1018,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1075,7 +1113,8 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; @@ -1734,6 +1773,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; + if (F2FS_I_SB(inode)->s_ndevs) + return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0496e82834d0..96221b1f620e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -709,6 +709,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -757,10 +771,8 @@ struct f2fs_sb_info { #endif #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_blkz; /* Total number of zones */ unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ - u8 *blkz_type; /* Array of zones type */ #endif /* for node-related operations */ @@ -876,6 +888,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2138,6 +2152,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); @@ -2425,11 +2442,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, - block_t blkaddr) + struct block_device *bdev, block_t blkaddr) { unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; - return sbi->blkz_type[zno]; + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; } #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4cc589a493e3..1869bc3973d5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -403,6 +403,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_bdev = bdev; + ret = submit_bio_wait(bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -413,25 +440,18 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -452,15 +472,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -637,14 +653,18 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio) /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) + struct block_device *bdev, block_t blkstart, block_t blklen) { - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio = NULL; int err; trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(blkstart), SECTOR_FROM_BLOCK(blklen), @@ -662,18 +682,24 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, } #ifdef CONFIG_BLK_DEV_ZONED -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t sector = SECTOR_FROM_BLOCK(blkstart); sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - struct block_device *bdev = sbi->sb->s_bdev; + sector_t sector; + int devi = 0; - if (nr_sects != bdev_zone_size(bdev)) { + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, - "Unaligned discard attempted (sector %llu + %llu)", - (unsigned long long)sector, - (unsigned long long)nr_sects); + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); return -EIO; } @@ -682,14 +708,12 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, * use regular discard if the drive supports it. For sequential * zones, reset the zone write pointer. */ - switch (get_blkz_type(sbi, blkstart)) { + switch (get_blkz_type(sbi, bdev, blkstart)) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, blkstart, - blklen); - + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: trace_f2fs_issue_reset_zone(sbi->sb, blkstart); @@ -702,14 +726,45 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } #endif +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); +} + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); @@ -717,11 +772,9 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb)) - return f2fs_issue_discard_zone(sbi, blkstart, blklen); -#endif - return __f2fs_issue_discard_async(sbi, blkstart, blklen); + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index eca9aea9834e..4ccbb860ad06 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -1516,9 +1531,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED -static int init_blkz_info(struct f2fs_sb_info *sbi) +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { - struct block_device *bdev = sbi->sb->s_bdev; + struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; sector_t sector = 0; struct blk_zone *zones; @@ -1529,15 +1544,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) if (!f2fs_sb_mounted_blkzoned(sbi->sb)) return 0; + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + return -EINVAL; sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; if (nr_sectors & (bdev_zone_size(bdev) - 1)) - sbi->nr_blkz++; + FDEV(devi).nr_blkz++; - sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); - if (!sbi->blkz_type) + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 @@ -1562,7 +1583,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) } for (i = 0; i < nr_zones; i++) { - sbi->blkz_type[n] = zones[i].type; + FDEV(devi).blkz_type[n] = zones[i].type; sector += zones[i].len; n++; } @@ -1666,6 +1687,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1724,15 +1816,7 @@ try_onemore: "Zoned block device support is not enabled\n"); goto free_sb_buf; } -#else - if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sb)) { - f2fs_msg(sb, KERN_ERR, - "Zoned block device feature not enabled\n"); - goto free_sb_buf; - } #endif - default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1802,6 +1886,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1820,15 +1911,6 @@ try_onemore: init_ino_entry_info(sbi); -#ifdef CONFIG_BLK_DEV_ZONED - err = init_blkz_info(sbi); - if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS blkzone information"); - goto free_blkz; - } -#endif - /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -2007,10 +2089,8 @@ free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -#ifdef CONFIG_BLK_DEV_ZONED -free_blkz: - kfree(sbi->blkz_type); -#endif +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 422630b8e588..cea41a124a80 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -52,10 +52,17 @@ #define VERSION_LEN 256 #define MAX_VOLUME_NAME 512 +#define MAX_PATH_LEN 64 +#define MAX_DEVICES 8 /* * For superblock */ +struct f2fs_device { + __u8 path[MAX_PATH_LEN]; + __le32 total_segments; +} __packed; + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -94,7 +101,8 @@ struct f2fs_super_block { __le32 feature; /* defined features */ __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ - __u8 reserved[871]; /* valid reserved region */ + struct f2fs_device devs[MAX_DEVICES]; /* device list */ + __u8 reserved[327]; /* valid reserved region */ } __packed; /* From a7de608691f766cd148971a71d4f13aa1692d4c8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:31:56 -0800 Subject: [PATCH 44/66] f2fs: use err for f2fs_preallocate_blocks This patch has no functional change. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 26 +++++++++++++------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 35 +++++++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 861173f00459..2938311c97bf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -658,11 +658,11 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -674,19 +674,19 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -863,19 +863,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 96221b1f620e..a1dcc60ae946 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2160,7 +2160,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 67a6b5ddc251..39616de92a55 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1320,15 +1320,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1340,12 +1340,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1359,7 +1359,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -2263,12 +2263,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); From c040ff9d69fd1d782fe577ba9e35c1f5798158ae Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:46:40 -0800 Subject: [PATCH 45/66] f2fs: fix redundant block allocation In direct_IO path of f2fs_file_write_iter(), 1. f2fs_preallocate_blocks(F2FS_GET_BLOCK_PRE_DIO) -> allocate LBA X 2. f2fs_direct_IO() -> return 0; Then, f2fs_write_data_page() will allocate another LBA X+1. This makes EIO triggered by HM-SMR. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2938311c97bf..59203a3c143e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -658,6 +658,13 @@ alloc: return 0; } +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -677,7 +684,10 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) err = f2fs_convert_inline_inode(inode); if (err) return err; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { err = f2fs_convert_inline_inode(inode); @@ -1769,11 +1779,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) - return 0; - if (F2FS_I_SB(inode)->s_ndevs) + if (__force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); From 7702bdbe505a22380dd958e2ee35124c7c414806 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 17:38:35 -0800 Subject: [PATCH 46/66] f2fs: avoid BG_GC in f2fs_balance_fs If many threads hit has_not_enough_free_secs() in f2fs_balance_fs() at the same time, all the threads would do FG_GC or BG_GC. In this critical path, we totally don't need to do BG_GC at all. Let's avoid that. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 7 +++++-- fs/f2fs/segment.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a1dcc60ae946..c963fa8d6895 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2183,7 +2183,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 39616de92a55..9ea89ca8fd0b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1849,7 +1849,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a7b2c13404f8..36f6cfc613d3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -909,7 +909,7 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; @@ -950,6 +950,9 @@ gc_more: if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1869bc3973d5..20eedf0eb751 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -366,7 +366,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } From c79b7ff1d3c7710c23d8828a69d8cbc5597ad19f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 18:20:10 -0800 Subject: [PATCH 47/66] f2fs: fix wrong written_valid_blocks counting Previously, written_valid_blocks was got by ckpt->valid_block_count. But if the last checkpoint has some NEW_ADDR due to power-cut, we can get wrong value. Fix it to get the number from actual written block count from sit entries. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 20eedf0eb751..808cb6800852 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2255,7 +2255,6 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2322,7 +2321,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2476,6 +2475,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ From 36951b38d13ac7cce9fcf89e0e01c22ed0d05688 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Nov 2016 10:41:20 +0800 Subject: [PATCH 48/66] f2fs: don't wait writeback for datas during checkpoint Normally, while committing checkpoint, we will wait on all pages to be writebacked no matter the page is data or metadata, so in scenario where there are lots of data IO being submitted with metadata, we may suffer long latency for waiting writeback during checkpoint. Indeed, we only care about persistence for pages with metadata, but not pages with data, as file system consistent are only related to metadate, so in order to avoid encountering long latency in above scenario, let's recognize and reference metadata in submitted IOs, wait writeback only for metadatas. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +++++++++++++++++++++++++++++------ fs/f2fs/debug.c | 7 ++++--- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/file.c | 2 -- fs/f2fs/gc.c | 2 -- fs/f2fs/segment.c | 1 - 7 files changed, 40 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e5677e4a69d6..54cc6a9a0d30 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1002,7 +1002,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 59203a3c143e..8c5b63bda68b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,26 @@ #include "trace.h" #include +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; @@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio) mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -148,7 +171,6 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { - atomic_inc(&sbi->nr_wb_bios); if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); @@ -309,6 +331,11 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || @@ -322,8 +349,6 @@ alloc_new: io->fio = *fio; } - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -1339,7 +1364,6 @@ done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1742,7 +1766,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 2fdf23398fa1..67a04d8074bb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -313,8 +314,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c963fa8d6895..ae758e8c245d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -654,6 +654,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -661,6 +662,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -840,7 +843,6 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1254,7 +1256,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2209,7 +2212,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9ea89ca8fd0b..f9883d597b42 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -94,8 +94,6 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 36f6cfc613d3..5c2fe68be549 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -693,8 +693,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 808cb6800852..c8dc037dbfa1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -290,7 +290,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); From d40a43af0a57a017eba9ad2679183791587ceb6a Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 16 Nov 2016 17:26:24 +0800 Subject: [PATCH 49/66] f2fs: fix an infinite loop when flush nodes in cp Thread A Thread B - write_checkpoint - block_operations -blk_start_plug -sync_node_pages - f2fs_do_sync_file - fsync_node_pages - f2fs_wait_on_page_writeback Thread A wait for global F2FS_DIRTY_NODES decreased to zero, it start a plug list, some requests have been added to this list. Thread B lock one dirty node page, and wait this page write back. But this page has been in plug list of thread A with PG_writeback flag. Thread A keep on running and its plug list has no chance to finish, so it seems a deadlock between cp and fsync path. This patch add a wait on page write back before set node page dirty to avoid this problem. Signed-off-by: Yunlei He Signed-off-by: Pengyang Hou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 38cc07d49ff3..1de25f871016 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1410,6 +1410,7 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; From 04d47e673863c637a2b44ad34a558aeb5d0a727e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:11 +0800 Subject: [PATCH 50/66] f2fs: fix to account total free nid correctly Thread A Thread B Thread C - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid alloc last nid - f2fs_unlock_op - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid as node count still not be increased, we will loop in alloc_nid - f2fs_write_node_pages - f2fs_balance_fs_bg - f2fs_sync_fs - write_checkpoint - block_operations - f2fs_lock_all - f2fs_lock_op While creating new inode, we do not allocate and account nid atomically, so that when there is almost no free nids left, we may encounter deadloop like above stack. In order to avoid that, reuse nm_i::available_nids for accounting free nids and make nid allocation and counting being atomical during node creation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ae758e8c245d..1693d4cb593b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -512,7 +512,7 @@ enum nid_list { struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1de25f871016..c1bbfdcf519d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1884,11 +1884,13 @@ retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; - spin_lock(&nm_i->nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + /* We should not use stale free nids created by build_free_nids */ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); @@ -1899,6 +1901,7 @@ retry: __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1950,6 +1953,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i->state = NID_NEW; __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } + + nm_i->available_nids++; + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2149,6 +2155,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2221,8 +2240,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2297,7 +2320,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID_LIST] = 0; nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; From 281518c694a5228d6c46fac83529fb3e2c331281 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:31 +0800 Subject: [PATCH 51/66] f2fs: fix fdatasync For below two cases, we can't guarantee data consistence: a) 1. xfs_io "pwrite 0 4195328" "fsync" 2. xfs_io "pwrite 4195328 1024" "fdatasync" 3. godown 4. umount & mount --> isize we updated before fdatasync won't be recovered b) 1. xfs_io "pwrite -S 0xcc 0 4202496" "fsync" 2. xfs_io "fpunch 4194304 4096" "fdatasync" 3. godown 4. umount & mount --> dnode we punched before fdatasync won't be recovered The reason is that normally fdatasync won't be aware of modification of metadata in file, e.g. isize changing, dnode updating, so in ->fsync we will skip flushing node pages for above cases, result in making fdatasynced file being lost during recovery. Currently we have introduced DIRTY_META global list in sbi for tracking dirty inode selectively, so in fdatasync we can choose to flush nodes depend on dirty state of current inode in the list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/file.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1693d4cb593b..3f91cb53d545 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1710,8 +1710,17 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f9883d597b42..3d37333306a5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -208,7 +208,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } From 3a3a5ead7b6d2c9a29f493791ba23f264052db34 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 15:09:48 -0800 Subject: [PATCH 52/66] f2fs: do not recover i_size if it's valid If i_size is already valid during roll_forward recovery, we should not update it according to the block alignment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 62523b217571..687c176f0b56 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -425,7 +425,7 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (i_size_read(inode) <= (start << PAGE_SHIFT)) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* From 97dd26ad834739d4e4ea35fd7ab5f92824de4cbb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 18:53:16 -0800 Subject: [PATCH 53/66] f2fs: fix wrong AUTO_RECOVER condition If i_size is not aligned to the f2fs's block size, we should not skip inode update during fsync. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f91cb53d545..62383d20a69f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1721,7 +1721,8 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + i_size_read(inode) & PAGE_MASK) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); } From b4ceec29219e340178baa9c5f17bf97a42951cc8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 18 Nov 2016 22:21:13 +0800 Subject: [PATCH 54/66] f2fs: drop duplicate header timer.h Drop duplicate header timer.h from segment.c. Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c8dc037dbfa1..eaa0b40e1ba0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "segment.h" From b08b12d2ddc85b977a0531470cf6a7158289aaaf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Nov 2016 22:27:41 +0800 Subject: [PATCH 55/66] f2fs: fix incorrect free inode count in ->statfs While calculating inode count that we can create at most in the left space, we should consider space which data/node blocks occupied, since we create data/node mixly in main area. So fix the wrong calculation in ->statfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4ccbb860ad06..2852a0b34d64 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -852,7 +852,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From 05e6ea2685c964db1e675a24a4f4e2adc22d2388 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Sun, 20 Nov 2016 19:57:23 +0100 Subject: [PATCH 56/66] f2fs: set ->owner for debugfs status file's file_operations The struct file_operations instance serving the f2fs/status debugfs file lacks an initialization of its ->owner. This means that although that file might have been opened, the f2fs module can still get removed. Any further operation on that opened file, releasing included, will cause accesses to unmapped memory. Indeed, Mike Marshall reported the following: BUG: unable to handle kernel paging request at ffffffffa0307430 IP: [] full_proxy_release+0x24/0x90 <...> Call Trace: [] __fput+0xdf/0x1d0 [] ____fput+0xe/0x10 [] task_work_run+0x8e/0xc0 [] do_exit+0x2ae/0xae0 [] ? __audit_syscall_entry+0xae/0x100 [] ? syscall_trace_enter+0x1ca/0x310 [] do_group_exit+0x44/0xc0 [] SyS_exit_group+0x14/0x20 [] do_syscall_64+0x61/0x150 [] entry_SYSCALL64_slow_path+0x25/0x25 <...> ---[ end trace f22ae883fa3ea6b8 ]--- Fixing recursive fault but reboot is needed! Fix this by initializing the f2fs/status file_operations' ->owner with THIS_MODULE. This will allow debugfs to grab a reference to the f2fs module upon any open on that file, thus preventing it from getting removed. Fixes: 902829aa0b72 ("f2fs: move proc files to debugfs") Reported-by: Mike Marshall Reported-by: Martin Brandenburg Cc: stable@vger.kernel.org Signed-off-by: Nicolai Stange Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 67a04d8074bb..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -377,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, From 19c526515f6b998039d5d71fea879d255f173746 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Nov 2016 15:20:16 +0100 Subject: [PATCH 57/66] f2fs: fix 32-bit build The addition of multiple-device support broke CONFIG_BLK_DEV_ZONED on 32-bit machines because of a 64-bit division: fs/f2fs/f2fs.o: In function `__issue_discard_async': extent_cache.c:(.text.__issue_discard_async+0xd4): undefined reference to `__aeabi_uldivmod' Fortunately, bdev_zone_size() is guaranteed to return a power-of-two number, so we can replace the % operator with a cheaper bit mask. Fixes: 792b84b74b54 ("f2fs: support multiple devices") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index eaa0b40e1ba0..d5141a06b9a3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -693,7 +693,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path: "", From 8508e44ae98622f841f5ef29d0bf3d5db4e0c1cc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Nov 2016 12:45:15 -0800 Subject: [PATCH 58/66] f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack We don't guarantee cp_addr is fixed by cp_version. This is to sync with f2fs-tools. Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +++++++- fs/f2fs/f2fs.h | 28 +++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 54cc6a9a0d30..35601b0d077f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -772,6 +772,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) if (sanity_check_ckpt(sbi)) goto fail_no_cp; + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + if (cp_blks <= 1) goto done; @@ -1127,7 +1132,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1190,6 +1195,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 62383d20a69f..593cd6deffb7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -792,6 +792,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -1352,22 +1353,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; - return start_addr; } +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); From 26787236b36660baf4d136281d40b5bb33a570ec Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 28 Nov 2016 15:33:38 -0800 Subject: [PATCH 59/66] f2fs: do not activate auto_recovery for fallocated i_size If a file needs to keep its i_size by fallocate, we need to turn off auto recovery during roll-forward recovery. This will resolve the below scenario. 1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 4096" -c "fsync" 2. xfs_io -f /mnt/f2fs/file -c "falloc -k 4096 4096" -c "fsync" 3. md5sum /mnt/f2fs/file; 4. godown /mnt/f2fs/ 5. umount /mnt/f2fs/ 6. mount -t f2fs /dev/sdx /mnt/f2fs 7. md5sum /mnt/f2fs/file Reported-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 38 +++++++++++++++++++++----------------- fs/f2fs/file.c | 2 ++ fs/f2fs/recovery.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 593cd6deffb7..8a659493796b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -401,6 +401,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -413,6 +414,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -1716,23 +1719,6 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) -{ - if (dsync) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; - - spin_lock(&sbi->inode_lock[DIRTY_META]); - ret = list_empty(&F2FS_I(inode)->gdirty_list); - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return ret; - } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || - i_size_read(inode) & PAGE_MASK) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; @@ -1885,6 +1871,24 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3d37333306a5..9053a9c5a47e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1399,6 +1399,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 687c176f0b56..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -187,6 +187,8 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = ""; else @@ -425,7 +427,8 @@ retry_dn: continue; } - if (i_size_read(inode) <= (start << PAGE_SHIFT)) + if (!file_keep_isize(inode) && + (i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -478,8 +481,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); return err; } From 0002b61bdaac732bcff364a18f5bd57c95def0a5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Nov 2016 19:13:43 -0800 Subject: [PATCH 60/66] f2fs: return AOP_WRITEPAGE_ACTIVATE for writepage We should use AOP_WRITEPAGE_ACTIVATE when we bypass writing pages. Signed-off-by: Chao Yu Signed-off-by: Miao Xie Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c5b63bda68b..b90fb010a991 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1384,6 +1384,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1479,6 +1481,15 @@ continue_unlock: ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; From 204706c7accfabb67b97eef9f9a28361b6201199 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2016 15:11:32 -0800 Subject: [PATCH 61/66] Revert "f2fs: use percpu_counter for # of dirty pages in inode" This reverts commit 1beba1b3a953107c3ff5448ab4e4297db4619c76. The perpcu_counter doesn't provide atomicity in single core and consume more DRAM. That incurs fs_mark test failure due to ENOMEM. Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8a659493796b..35dbab157ec3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -431,7 +431,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -1269,7 +1269,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1285,7 +1285,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1295,9 +1295,9 @@ static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9053a9c5a47e..4c87261c1cd2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1533,7 +1533,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2852a0b34d64..7591d2d7b612 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -571,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -703,7 +699,6 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } From f455c8a5f0a24090e99249eb7280012376adec2c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 11:37:14 -0800 Subject: [PATCH 62/66] f2fs: call sync_fs when f2fs is idle The sync_fs in f2fs_balance_fs_bg must avoid interrupting current user requests. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d5141a06b9a3..8affc5621181 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -383,12 +383,15 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false); + if (!is_idle(sbi)) + return; + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; From 2040fce83fe17763b07c97c1f691da2bb85e4135 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 13:56:04 -0800 Subject: [PATCH 63/66] f2fs: detect wrong layout Previous mkfs.f2fs allows small partition inappropriately, so f2fs should detect that as well. Refer this in f2fs-tools. mkfs.f2fs: detect small partition by overprovision ratio and # of segments Reported-and-Tested-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89ab4301ef02..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7591d2d7b612..1a526474b332 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1452,6 +1452,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1463,6 +1464,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From a2125ff7dd1ed3a2a53cdc1f8f9c9cec9cfaa7ab Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 17:25:32 -0800 Subject: [PATCH 64/66] f2fs: free meta pages if sanity check for ckpt is failed This fixes missing freeing meta pages in the error case. Tested-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 35601b0d077f..698b13ae261c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -770,7 +770,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; if (cur_page == cp1) sbi->cur_cp_pack = 1; @@ -798,6 +798,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; From 5eba8c5d1fb3af28b2073ba5228d4998196c1bcc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Dec 2016 16:23:32 -0800 Subject: [PATCH 65/66] f2fs: fix to access nullified flush_cmd_control pointer f2fs_sync_file() remount_ro - f2fs_readonly - destroy_flush_cmd_control - f2fs_issue_flush - no fcc pointer! So, this patch doesn't free fcc in this case, but just stop its kernel thread which sends flush commands. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 33 +++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 35dbab157ec3..137bbf0ae4c6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2103,7 +2103,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8affc5621181..d7d5727fe8b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -489,8 +489,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -501,6 +506,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -508,6 +518,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { @@ -520,14 +531,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -2738,7 +2755,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1a526474b332..b62c10d28e06 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1102,8 +1102,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; From c0ed4405a99ec9be2a0f062eaafc002d8d26c99f Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 11 Dec 2016 15:35:15 +0800 Subject: [PATCH 66/66] f2fs: fix a missing size change in f2fs_setattr This patch fix a missing size change in f2fs_setattr Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4c87261c1cd2..383b5c29f46b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -677,6 +677,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = setattr_prepare(dentry, attr); if (err) @@ -707,6 +708,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -719,8 +722,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - /* update attributes only */ - f2fs_mark_inode_dirty_sync(inode, false); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true);