From 4d93874b9e9ce582fd4401334c88eaf93b6dff43 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 7 Feb 2021 11:04:22 -0800 Subject: [PATCH 01/32] ext4: use memcpy_from_page() in pagecache_read() Signed-off-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210207190425.38107-6-chaitanya.kulkarni@wdc.com Signed-off-by: Theodore Ts'o --- fs/ext4/verity.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 00e3cbde472e..daf88669a2e9 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -45,16 +45,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); From bd256fda92efe97b692dc72e246d35fa724d42d8 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 7 Feb 2021 11:04:23 -0800 Subject: [PATCH 02/32] ext4: use memcpy_to_page() in pagecache_write() Signed-off-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210207190425.38107-7-chaitanya.kulkarni@wdc.com Signed-off-by: Theodore Ts'o --- fs/ext4/verity.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index daf88669a2e9..c6ed2aa9c534 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -77,7 +77,6 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, @@ -85,9 +84,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, page, fsdata); From d699ae4fc27496d01e8bc5ab2106bd79d1e7be92 Mon Sep 17 00:00:00 2001 From: Alexander Lochmann Date: Thu, 11 Feb 2021 10:51:55 +0100 Subject: [PATCH 03/32] ext4: updated locking documentation for journal_t Some members of transaction_t are allowed to be read without any lock being held if consistency doesn't matter. Based on LockDoc's findings, we extended the locking documentation of those members. Each one of them is marked with a short comment: "no lock for quick racy checks". Signed-off-by: Alexander Lochmann Signed-off-by: Horst Schirmeier Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/ad82c7a9-a624-4ed5-5ada-a6410c44c0b3@tu-dortmund.de Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 99d3cd051ac3..74710bb231a6 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -768,7 +768,8 @@ enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; struct journal_s { /** - * @j_flags: General journaling state flags [j_state_lock] + * @j_flags: General journaling state flags [j_state_lock, + * no lock for quick racy checks] */ unsigned long j_flags; @@ -808,7 +809,8 @@ struct journal_s /** * @j_barrier_count: * - * Number of processes waiting to create a barrier lock [j_state_lock] + * Number of processes waiting to create a barrier lock [j_state_lock, + * no lock for quick racy checks] */ int j_barrier_count; @@ -821,7 +823,8 @@ struct journal_s * @j_running_transaction: * * Transactions: The current running transaction... - * [j_state_lock] [caller holding open handle] + * [j_state_lock, no lock for quick racy checks] [caller holding + * open handle] */ transaction_t *j_running_transaction; @@ -1033,7 +1036,7 @@ struct journal_s * @j_commit_sequence: * * Sequence number of the most recently committed transaction - * [j_state_lock]. + * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_sequence; @@ -1041,7 +1044,7 @@ struct journal_s * @j_commit_request: * * Sequence number of the most recent transaction wanting commit - * [j_state_lock] + * [j_state_lock, no lock for quick racy checks] */ tid_t j_commit_request; From 3042b1b45c4106feff063932d4fd481c5009dbe1 Mon Sep 17 00:00:00 2001 From: Alexander Lochmann Date: Thu, 11 Feb 2021 18:14:10 +0100 Subject: [PATCH 04/32] Updated locking documentation for transaction_t Some members of transaction_t are allowed to be read without any lock being held if accessed from the correct context. We used LockDoc's findings to determine those members. Each member of them is marked with a short comment: "no lock needed for jbd2 thread". Signed-off-by: Alexander Lochmann Signed-off-by: Horst Schirmeier Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210211171410.17984-1-alexander.lochmann@tu-dortmund.de Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 74710bb231a6..b9aa85081a40 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -594,18 +594,22 @@ struct transaction_s */ unsigned long t_log_start; - /* Number of buffers on the t_buffers list [j_list_lock] */ + /* + * Number of buffers on the t_buffers list [j_list_lock, no locks + * needed for jbd2 thread] + */ int t_nr_buffers; /* * Doubly-linked circular list of all buffers reserved but not yet - * modified by this transaction [j_list_lock] + * modified by this transaction [j_list_lock, no locks needed fo + * jbd2 thread] */ struct journal_head *t_reserved_list; /* * Doubly-linked circular list of all metadata buffers owned by this - * transaction [j_list_lock] + * transaction [j_list_lock, no locks needed for jbd2 thread] */ struct journal_head *t_buffers; @@ -629,9 +633,11 @@ struct transaction_s struct journal_head *t_checkpoint_io_list; /* - * Doubly-linked circular list of metadata buffers being shadowed by log - * IO. The IO buffers on the iobuf list and the shadow buffers on this - * list match each other one for one at all times. [j_list_lock] + * Doubly-linked circular list of metadata buffers being + * shadowed by log IO. The IO buffers on the iobuf list and + * the shadow buffers on this list match each other one for + * one at all times. [j_list_lock, no locks needed for jbd2 + * thread] */ struct journal_head *t_shadow_list; From 6b3caab4ba9b2d290162e610810a946a33c65117 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Tue, 16 Feb 2021 14:16:34 -0500 Subject: [PATCH 05/32] ext4: delete some unused tracepoint definitions A number of tracepoint instances have been removed from ext4 by past patches but the definitions of those tracepoints have not. All instances of ext4_ext_in_cache and ext4_ext_put_in_cache were removed by commit 69eb33dc24dc ("ext4: remove single extent cache"). ext4_get_reserved_cluster_alloc was removed by commit b6bf9171ef5c ("ext4: reduce reserved cluster count by number of allocated clusters"). ext4_find_delalloc_range was removed by commit 7d1b1fbc95eb ("ext4: reimplement ext4_find_delay_alloc_range on extent status tree"). All instances of ext4_direct_IO_enter and ext4_direct_IO_exit were removed by commit 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure"). Signed-off-by: Eric Whitney Link: https://lore.kernel.org/r/20210216191634.20957-1-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 176 ------------------------------------ 1 file changed, 176 deletions(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 70ae5497b73a..0ea36b2b0662 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1358,64 +1358,6 @@ TRACE_EVENT(ext4_read_block_bitmap_load, __entry->group, __entry->prefetch) ); -TRACE_EVENT(ext4_direct_IO_enter, - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), - - TP_ARGS(inode, offset, len, rw), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, pos ) - __field( unsigned long, len ) - __field( int, rw ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = offset; - __entry->len = len; - __entry->rw = rw; - ), - - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->pos, __entry->len, __entry->rw) -); - -TRACE_EVENT(ext4_direct_IO_exit, - TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, - int rw, int ret), - - TP_ARGS(inode, offset, len, rw, ret), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, pos ) - __field( unsigned long, len ) - __field( int, rw ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = offset; - __entry->len = len; - __entry->rw = rw; - __entry->ret = ret; - ), - - TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - __entry->pos, __entry->len, - __entry->rw, __entry->ret) -); - DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), @@ -1962,124 +1904,6 @@ TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, __entry->len, show_mflags(__entry->flags), __entry->ret) ); -TRACE_EVENT(ext4_ext_put_in_cache, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, - ext4_fsblk_t start), - - TP_ARGS(inode, lblk, len, start), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( unsigned int, len ) - __field( ext4_fsblk_t, start ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->lblk = lblk; - __entry->len = len; - __entry->start = start; - ), - - TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->lblk, - __entry->len, - (unsigned long long) __entry->start) -); - -TRACE_EVENT(ext4_ext_in_cache, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret), - - TP_ARGS(inode, lblk, ret), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->lblk = lblk; - __entry->ret = ret; - ), - - TP_printk("dev %d,%d ino %lu lblk %u ret %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->lblk, - __entry->ret) - -); - -TRACE_EVENT(ext4_find_delalloc_range, - TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to, - int reverse, int found, ext4_lblk_t found_blk), - - TP_ARGS(inode, from, to, reverse, found, found_blk), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, from ) - __field( ext4_lblk_t, to ) - __field( int, reverse ) - __field( int, found ) - __field( ext4_lblk_t, found_blk ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->from = from; - __entry->to = to; - __entry->reverse = reverse; - __entry->found = found; - __entry->found_blk = found_blk; - ), - - TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d " - "(blk = %u)", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->from, (unsigned) __entry->to, - __entry->reverse, __entry->found, - (unsigned) __entry->found_blk) -); - -TRACE_EVENT(ext4_get_reserved_cluster_alloc, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len), - - TP_ARGS(inode, lblk, len), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( unsigned int, len ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->lblk = lblk; - __entry->len = len; - ), - - TP_printk("dev %d,%d ino %lu lblk %u len %u", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, - (unsigned) __entry->lblk, - __entry->len) -); - TRACE_EVENT(ext4_ext_show_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, unsigned short len), From 400086d7c11327cfe1b1aa4487adceb41e82b4df Mon Sep 17 00:00:00 2001 From: Milan Djurovic Date: Mon, 15 Mar 2021 22:29:53 -0700 Subject: [PATCH 06/32] ext4: remove unnecessary braces in fs/ext4/dir.c Removes braces to follow the coding style. Signed-off-by: Milan Djurovic Link: https://lore.kernel.org/r/20210316052953.67616-1-mdjurovic@zohomail.com Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5ed870614c8d..110688d6b7a6 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -124,9 +124,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); - if (err != ERR_BAD_DX_DIR) { + if (err != ERR_BAD_DX_DIR) return err; - } + /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_metadata_csum(sb)) { /* From 471fbbea7ff7061b2d6474665cb5a2ceb4fd6500 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 19 Mar 2021 07:34:13 +0000 Subject: [PATCH 07/32] ext4: handle casefolding with encryption This adds support for encryption with casefolding. Since the name on disk is case preserving, and also encrypted, we can no longer just recompute the hash on the fly. Additionally, to avoid leaking extra information from the hash of the unencrypted name, we use siphash via an fscrypt v2 policy. The hash is stored at the end of the directory entry for all entries inside of an encrypted and casefolded directory apart from those that deal with '.' and '..'. This way, the change is backwards compatible with existing ext4 filesystems. [ Changed to advertise this feature via the file: /sys/fs/ext4/features/encrypted_casefold -- TYT ] Signed-off-by: Daniel Rosenberg Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20210319073414.1381041-2-drosen@google.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/directory.rst | 27 +++ fs/ext4/dir.c | 37 +++- fs/ext4/ext4.h | 56 +++++- fs/ext4/hash.c | 25 ++- fs/ext4/inline.c | 25 ++- fs/ext4/namei.c | 200 ++++++++++++++----- fs/ext4/super.c | 6 - fs/ext4/sysfs.c | 2 + 8 files changed, 288 insertions(+), 90 deletions(-) diff --git a/Documentation/filesystems/ext4/directory.rst b/Documentation/filesystems/ext4/directory.rst index 073940cc64ed..55f618b37144 100644 --- a/Documentation/filesystems/ext4/directory.rst +++ b/Documentation/filesystems/ext4/directory.rst @@ -121,6 +121,31 @@ The directory file type is one of the following values: * - 0x7 - Symbolic link. +To support directories that are both encrypted and casefolded directories, we +must also include hash information in the directory entry. We append +``ext4_extended_dir_entry_2`` to ``ext4_dir_entry_2`` except for the entries +for dot and dotdot, which are kept the same. The structure follows immediately +after ``name`` and is included in the size listed by ``rec_len`` If a directory +entry uses this extension, it may be up to 271 bytes. + +.. list-table:: + :widths: 8 8 24 40 + :header-rows: 1 + + * - Offset + - Size + - Name + - Description + * - 0x0 + - \_\_le32 + - hash + - The hash of the directory name + * - 0x4 + - \_\_le32 + - minor\_hash + - The minor hash of the directory name + + In order to add checksums to these classic directory blocks, a phony ``struct ext4_dir_entry`` is placed at the end of each leaf block to hold the checksum. The directory entry is 12 bytes long. The inode @@ -322,6 +347,8 @@ The directory hash is one of the following values: - Half MD4, unsigned. * - 0x5 - Tea, unsigned. + * - 0x6 + - Siphash. Interior nodes of an htree are recorded as ``struct dx_node``, which is also the full length of a data block: diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 110688d6b7a6..ffb295aa891c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode) return 0; } +static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) +{ + /* Check if . or .. , or skip if namelen is 0 */ + if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && + (de->name[1] == '.' || de->name[1] == '\0')) + return true; + /* Check if this is a csum entry */ + if (de->file_type == EXT4_FT_DIR_CSUM) + return true; + return false; +} + /* * Return 0 if the directory entry is OK, and 1 if there is a problem * @@ -73,16 +85,20 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; + bool fake = is_fake_dir_entry(de); + bool has_csum = ext4_has_metadata_csum(dir->i_sb); - if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) + if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) + else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, + fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; - else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && + else if (unlikely(next_offset > size - ext4_dir_rec_len(1, + has_csum ? NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > @@ -94,15 +110,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " - "inode=%u, rec_len=%d, name_len=%d, size=%d", + "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), - rlen, de->name_len, size); + rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " - "inode=%u, rec_len=%d, name_len=%d, size=%d", + "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), - rlen, de->name_len, size); + rlen, size, fake); return 1; } @@ -224,7 +240,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + sb->s_blocksize) < ext4_dir_rec_len(1, + inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); @@ -265,7 +282,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, - 0, 0, &de_name, &fstr); + EXT4_DIRENT_HASH(de), + EXT4_DIRENT_MINOR_HASH(de), + &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 826a56e3bbd2..a796985c2b7a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2196,6 +2196,17 @@ struct ext4_dir_entry { char name[EXT4_NAME_LEN]; /* File name */ }; + +/* + * Encrypted Casefolded entries require saving the hash on disk. This structure + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned + * boundary. + */ +struct ext4_dir_entry_hash { + __le32 hash; + __le32 minor_hash; +}; + /* * The new version of the directory entry. Since EXT4 structures are * stored in intel byte order, and the name_len field could never be @@ -2210,6 +2221,22 @@ struct ext4_dir_entry_2 { char name[EXT4_NAME_LEN]; /* File name */ }; +/* + * Access the hashes at the end of ext4_dir_entry_2 + */ +#define EXT4_DIRENT_HASHES(entry) \ + ((struct ext4_dir_entry_hash *) \ + (((void *)(entry)) + \ + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_MINOR_HASH(entry) \ + le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + +static inline bool ext4_hash_in_dirent(const struct inode *inode) +{ + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); +} + /* * This is a bogus directory entry at the end of each leaf block that * records checksums. @@ -2251,10 +2278,24 @@ struct ext4_dir_entry_tail { */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) -#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ - ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) +/* + * The rec_len is dependent on the type of directory. Directories that are + * casefolded and encrypted need to store the hash as well, so we add room for + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. + */ +static inline unsigned int ext4_dir_rec_len(__u8 name_len, + const struct inode *dir) +{ + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); + return (rec_len & ~EXT4_DIR_ROUND); +} + /* * If we ever get support for fs block sizes > page_size, we'll need * to remove the #if statements in the next two functions... @@ -2311,6 +2352,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 +#define DX_HASH_SIPHASH 6 static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, const void *address, unsigned int length) @@ -2365,6 +2407,7 @@ struct ext4_filename { }; #define fname_name(p) ((p)->disk_name.name) +#define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* @@ -2707,9 +2750,9 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (buf), (size), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, @@ -2720,7 +2763,7 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, +void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); @@ -3519,9 +3562,6 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); -extern int ext4_ci_compare(const struct inode *parent, - const struct qstr *fname, - const struct qstr *entry, bool quick); extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode); extern int __ext4_link(struct inode *dir, struct inode *inode, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index a92eb79de0cc..f34f4176c1e7 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -197,7 +197,7 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) * represented, and whether or not the returned hash is 32 bits or 64 * bits. 32 bit hashes will return 0 for the minor hash. */ -static int __ext4fs_dirhash(const char *name, int len, +static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { __u32 hash; @@ -259,6 +259,22 @@ static int __ext4fs_dirhash(const char *name, int len, hash = buf[0]; minor_hash = buf[1]; break; + case DX_HASH_SIPHASH: + { + struct qstr qname = QSTR_INIT(name, len); + __u64 combined_hash; + + if (fscrypt_has_encryption_key(dir)) { + combined_hash = fscrypt_fname_siphash(dir, &qname); + } else { + ext4_warning_inode(dir, "Siphash requires key"); + return -1; + } + + hash = (__u32)(combined_hash >> 32); + minor_hash = (__u32)combined_hash; + break; + } default: hinfo->hash = 0; return -1; @@ -280,7 +296,8 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um) { + if (len && IS_CASEFOLDED(dir) && um && + (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; @@ -291,12 +308,12 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, goto opaque_seq; } - r = __ext4fs_dirhash(buff, dlen, hinfo); + r = __ext4fs_dirhash(dir, buff, dlen, hinfo); kfree(buff); return r; } opaque_seq: #endif - return __ext4fs_dirhash(name, len, hinfo); + return __ext4fs_dirhash(dir, name, len, hinfo); } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index b41512d1badc..ba960e6e7e3a 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1031,7 +1031,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(inode, de, inline_size, fname); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1100,7 +1100,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); - if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, @@ -1380,8 +1380,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file, fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(fake.name_len), - inline_size); + ext4_dir_rec_len(fake.name_len, NULL), + inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; @@ -1390,8 +1390,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file, fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(fake.name_len), - inline_size); + ext4_dir_rec_len(fake.name_len, NULL), + inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; @@ -1406,7 +1406,12 @@ int ext4_inlinedir_to_tree(struct file *dir_file, } } - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (ext4_hash_in_dirent(dir)) { + hinfo->hash = EXT4_DIRENT_HASH(de); + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1488,8 +1493,8 @@ int ext4_read_inline_dir(struct file *file, * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ - dotdot_offset = EXT4_DIR_REC_LEN(1); - dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + dotdot_offset = ext4_dir_rec_len(1, NULL); + dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; @@ -1524,7 +1529,7 @@ int ext4_read_inline_dir(struct file *file, * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) - < EXT4_DIR_REC_LEN(1)) + < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 883e2a7cd4ab..68646500f7ba 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -280,9 +280,11 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count, unsigned blocksize); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); +static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, + char *to, struct dx_map_entry *offsets, + int count, unsigned int blocksize); +static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + unsigned int blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, @@ -574,8 +576,9 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - - EXT4_DIR_REC_LEN(2) - infosize; + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(1, NULL) - + ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); @@ -584,7 +587,8 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) static inline unsigned dx_node_limit(struct inode *dir) { - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(0, dir); if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); @@ -673,7 +677,10 @@ static struct stats dx_show_leaf(struct inode *dir, name = fname_crypto_str.name; len = fname_crypto_str.len; } - ext4fs_dirhash(dir, de->name, + if (IS_CASEFOLDED(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else + ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de @@ -689,7 +696,7 @@ static struct stats dx_show_leaf(struct inode *dir, (unsigned) ((char *) de - base)); #endif } - space += EXT4_DIR_REC_LEN(de->name_len); + space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); @@ -784,11 +791,25 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && - root->info.hash_version != DX_HASH_LEGACY) { + root->info.hash_version != DX_HASH_LEGACY && + root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } + if (ext4_hash_in_dirent(dir)) { + if (root->info.hash_version != DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash in dirent, but hash is not SIPHASH"); + goto fail; + } + } else { + if (root->info.hash_version == DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash code is SIPHASH, but hash not in dirent"); + goto fail; + } + } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; @@ -997,6 +1018,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; + int csum = ext4_has_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1005,9 +1027,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; + /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - - EXT4_DIR_REC_LEN(0)); + ext4_dir_rec_len(0, + csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); @@ -1031,7 +1055,17 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* silently ignore the rest of the block */ break; } - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (ext4_hash_in_dirent(dir)) { + if (de->name_len && de->inode) { + hinfo->hash = EXT4_DIRENT_HASH(de); + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + hinfo->hash = 0; + hinfo->minor_hash = 0; + } + } else { + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1100,7 +1134,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { - hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (ext4_hash_in_dirent(dir)) + hinfo.hash_version = DX_HASH_SIPHASH; + else + hinfo.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; @@ -1218,7 +1256,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { - ext4fs_dirhash(dir, de->name, de->name_len, &h); + if (ext4_hash_in_dirent(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else + ext4fs_dirhash(dir, de->name, de->name_len, &h); map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1282,31 +1323,47 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * Returns: 0 if the directory entry matches, more than 0 if it * doesn't match or less than zero on error. */ -int ext4_ci_compare(const struct inode *parent, const struct qstr *name, - const struct qstr *entry, bool quick) +static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, + u8 *de_name, size_t de_name_len, bool quick) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); + struct qstr entry = QSTR_INIT(de_name, de_name_len); int ret; - if (quick) - ret = utf8_strncasecmp_folded(um, name, entry); - else - ret = utf8_strncasecmp(um, name, entry); + if (IS_ENCRYPTED(parent)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT(de_name, de_name_len); + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, + &decrypted_name); + if (ret < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; + } + + if (quick) + ret = utf8_strncasecmp_folded(um, name, &entry); + else + ret = utf8_strncasecmp(um, name, &entry); if (ret < 0) { /* Handle invalid character sequence as either an error * or as an opaque byte sequence. */ if (sb_has_strict_encoding(sb)) - return -EINVAL; - - if (name->len != entry->len) - return 1; - - return !!memcmp(name->name, entry->name, name->len); + ret = -EINVAL; + else if (name->len != entry.len) + ret = 1; + else + ret = !!memcmp(name->name, entry.name, entry.len); } - +out: + kfree(decrypted_name.name); return ret; } @@ -1342,14 +1399,11 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, * * Return: %true if the directory entry matches, otherwise %false. */ -static inline bool ext4_match(const struct inode *parent, +static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, - const struct ext4_dir_entry_2 *de) + struct ext4_dir_entry_2 *de) { struct fscrypt_name f; -#ifdef CONFIG_UNICODE - const struct qstr entry = {.name = de->name, .len = de->name_len}; -#endif if (!de->inode) return false; @@ -1365,10 +1419,23 @@ static inline bool ext4_match(const struct inode *parent, if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; - return !ext4_ci_compare(parent, &cf, &entry, true); + if (IS_ENCRYPTED(parent)) { + struct dx_hash_info hinfo; + + hinfo.hash_version = DX_HASH_SIPHASH; + hinfo.seed = NULL; + ext4fs_dirhash(parent, fname->cf_name.name, + fname_len(fname), &hinfo); + if (hinfo.hash != EXT4_DIRENT_HASH(de) || + hinfo.minor_hash != + EXT4_DIRENT_MINOR_HASH(de)) + return 0; + } + return !ext4_ci_compare(parent, &cf, de->name, + de->name_len, true); } - return !ext4_ci_compare(parent, fname->usr_fname, &entry, - false); + return !ext4_ci_compare(parent, fname->usr_fname, de->name, + de->name_len, false); } #endif @@ -1765,7 +1832,8 @@ struct dentry *ext4_get_parent(struct dentry *child) * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, +dx_move_dirents(struct inode *dir, char *from, char *to, + struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; @@ -1773,7 +1841,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); - rec_len = EXT4_DIR_REC_LEN(de->name_len); + rec_len = ext4_dir_rec_len(de->name_len, dir); + memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); @@ -1788,7 +1857,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; @@ -1797,7 +1867,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { - rec_len = EXT4_DIR_REC_LEN(de->name_len); + rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); @@ -1887,9 +1957,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, + de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, blocksize); - de = dx_pack_dirents(data1, blocksize); + de = dx_pack_dirents(dir, data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, blocksize); @@ -1937,7 +2007,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); + unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); int nlen, rlen; unsigned int offset = 0; char *top; @@ -1950,7 +2020,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return -EFSCORRUPTED; if (ext4_match(dir, fname, de)) return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); + nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) break; @@ -1964,7 +2034,8 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return 0; } -void ext4_insert_dentry(struct inode *inode, +void ext4_insert_dentry(struct inode *dir, + struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname) @@ -1972,7 +2043,7 @@ void ext4_insert_dentry(struct inode *inode, int nlen, rlen; - nlen = EXT4_DIR_REC_LEN(de->name_len); + nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = @@ -1986,6 +2057,17 @@ void ext4_insert_dentry(struct inode *inode, ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); + if (ext4_hash_in_dirent(dir)) { + struct dx_hash_info hinfo; + + hinfo.hash_version = DX_HASH_SIPHASH; + hinfo.seed = NULL; + ext4fs_dirhash(dir, fname_usr_name(fname), + fname_len(fname), &hinfo); + EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo.hash); + EXT4_DIRENT_HASHES(de)->minor_hash = + cpu_to_le32(hinfo.minor_hash); + } } /* @@ -2022,7 +2104,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, } /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, fname); + ext4_insert_dentry(dir, inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful @@ -2114,11 +2196,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), - blocksize); + de->rec_len = ext4_rec_len_to_disk( + blocksize - ext4_dir_rec_len(2, NULL), blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); - root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (ext4_hash_in_dirent(dir)) + root->info.hash_version = DX_HASH_SIPHASH; + else + root->info.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; dx_set_block(entries, 1); dx_set_count(entries, 1); @@ -2129,7 +2216,12 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); + if (ext4_hash_in_dirent(dir)) + ext4fs_dirhash(dir, fname_usr_name(fname), + fname_len(fname), &fname->hinfo); + else + ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), &fname->hinfo); memset(frames, 0, sizeof(frames)); frame = frames; @@ -2722,7 +2814,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, { de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, "."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); @@ -2732,11 +2824,12 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, de->name_len = 2; if (!dotdot_real_len) de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), + (csum_size + ext4_dir_rec_len(1, NULL)), blocksize); else de->rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(de->name_len), blocksize); + ext4_dir_rec_len(de->name_len, NULL), + blocksize); strcpy(de->name, ".."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); @@ -2869,7 +2962,8 @@ bool ext4_empty_dir(struct inode *inode) } sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + if (inode->i_size < ext4_dir_rec_len(1, NULL) + + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); return true; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9693680463a..7b937a84f657 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4292,12 +4292,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) struct unicode_map *encoding; __u16 encoding_flags; - if (ext4_has_feature_encrypt(sb)) { - ext4_msg(sb, KERN_ERR, - "Can't mount with encoding and encryption"); - goto failed_mount; - } - if (ext4_sb_read_encoding(es, &encoding_info, &encoding_flags)) { ext4_msg(sb, KERN_ERR, diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index a3d08276d441..7367ba406e01 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -313,6 +313,7 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); +EXT4_ATTR_FEATURE(encrypted_casefold); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -330,6 +331,7 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), + ATTR_LIST(encrypted_casefold), NULL, }; ATTRIBUTE_GROUPS(ext4_feat); From 1ae98e295fa2577fb5e492200c58d10230e00e99 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 19 Mar 2021 07:34:14 +0000 Subject: [PATCH 08/32] ext4: optimize match for casefolded encrypted dirs Matching names with casefolded encrypting directories requires decrypting entries to confirm case since we are case preserving. We can avoid needing to decrypt if our hash values don't match. Signed-off-by: Daniel Rosenberg Link: https://lore.kernel.org/r/20210319073414.1381041-3-drosen@google.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 17 ++++++++-------- fs/ext4/namei.c | 53 ++++++++++++++++++++++++++----------------------- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a796985c2b7a..da7cb2c755b6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2638,9 +2638,9 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #ifdef CONFIG_UNICODE -extern void ext4_fname_setup_ci_filename(struct inode *dir, +extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, - struct fscrypt_str *fname); + struct ext4_filename *fname); #endif #ifdef CONFIG_FS_ENCRYPTION @@ -2671,9 +2671,9 @@ static inline int ext4_fname_setup_filename(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif - return 0; + return err; } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2690,9 +2690,9 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); #endif - return 0; + return err; } static inline void ext4_fname_free_filename(struct ext4_filename *fname) @@ -2717,15 +2717,16 @@ static inline int ext4_fname_setup_filename(struct inode *dir, int lookup, struct ext4_filename *fname) { + int err = 0; fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif - return 0; + return err; } static inline int ext4_fname_prepare_lookup(struct inode *dir, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 68646500f7ba..f2e9cb71763d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -816,7 +816,9 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (fname && fname_name(fname)) + /* hash is already computed for encrypted casefolded directory */ + if (fname && fname_name(fname) && + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; @@ -1367,19 +1369,21 @@ out: return ret; } -void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, - struct fscrypt_str *cf_name) +int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, + struct ext4_filename *name) { + struct fscrypt_str *cf_name = &name->cf_name; + struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { cf_name->name = NULL; - return; + return 0; } cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!cf_name->name) - return; + return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, cf_name->name, @@ -1387,10 +1391,18 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, if (len <= 0) { kfree(cf_name->name); cf_name->name = NULL; - return; } cf_name->len = (unsigned) len; + if (!IS_ENCRYPTED(dir)) + return 0; + hinfo->hash_version = DX_HASH_SIPHASH; + hinfo->seed = NULL; + if (cf_name->name) + ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); + else + ext4fs_dirhash(dir, iname->name, iname->len, hinfo); + return 0; } #endif @@ -1420,16 +1432,12 @@ static bool ext4_match(struct inode *parent, struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; if (IS_ENCRYPTED(parent)) { - struct dx_hash_info hinfo; + if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || + fname->hinfo.minor_hash != + EXT4_DIRENT_MINOR_HASH(de)) { - hinfo.hash_version = DX_HASH_SIPHASH; - hinfo.seed = NULL; - ext4fs_dirhash(parent, fname->cf_name.name, - fname_len(fname), &hinfo); - if (hinfo.hash != EXT4_DIRENT_HASH(de) || - hinfo.minor_hash != - EXT4_DIRENT_MINOR_HASH(de)) return 0; + } } return !ext4_ci_compare(parent, &cf, de->name, de->name_len, true); @@ -2058,15 +2066,11 @@ void ext4_insert_dentry(struct inode *dir, de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); if (ext4_hash_in_dirent(dir)) { - struct dx_hash_info hinfo; + struct dx_hash_info *hinfo = &fname->hinfo; - hinfo.hash_version = DX_HASH_SIPHASH; - hinfo.seed = NULL; - ext4fs_dirhash(dir, fname_usr_name(fname), - fname_len(fname), &hinfo); - EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo.hash); + EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); EXT4_DIRENT_HASHES(de)->minor_hash = - cpu_to_le32(hinfo.minor_hash); + cpu_to_le32(hinfo->minor_hash); } } @@ -2216,10 +2220,9 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (ext4_hash_in_dirent(dir)) - ext4fs_dirhash(dir, fname_usr_name(fname), - fname_len(fname), &fname->hinfo); - else + + /* casefolded encrypted hashes are computed on fname setup */ + if (!ext4_hash_in_dirent(dir)) ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); From d556435156b7970b8ce61b355df558a5168927cc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Mar 2021 11:21:38 +0100 Subject: [PATCH 09/32] jbd2: avoid -Wempty-body warnings Building with 'make W=1' shows a harmless -Wempty-body warning: fs/jbd2/recovery.c: In function 'fc_do_one_pass': fs/jbd2/recovery.c:267:75: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 267 | jbd_debug(3, "Fast commit replay failed, err = %d\n", err); | ^ Change the empty dprintk() macros to no_printk(), which avoids this warning and adds format string checking. Signed-off-by: Arnd Bergmann Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210322102152.95684-1-arnd@kernel.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b9aa85081a40..db0e1920cb12 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -61,7 +61,7 @@ void __jbd2_debug(int level, const char *file, const char *func, #define jbd_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else -#define jbd_debug(n, fmt, a...) /**/ +#define jbd_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); From a149d2a5cabbf6507a7832a1c4fd2593c55fd450 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 31 Mar 2021 20:15:16 +0800 Subject: [PATCH 10/32] ext4: fix check to prevent false positive report of incorrect used inodes Commit <50122847007> ("ext4: fix check to prevent initializing reserved inodes") check the block group zero and prevent initializing reserved inodes. But in some special cases, the reserved inode may not all belong to the group zero, it may exist into the second group if we format filesystem below. mkfs.ext4 -b 4096 -g 8192 -N 1024 -I 4096 /dev/sda So, it will end up triggering a false positive report of a corrupted file system. This patch fix it by avoid check reserved inodes if no free inode blocks will be zeroed. Cc: stable@kernel.org Fixes: 50122847007 ("ext4: fix check to prevent initializing reserved inodes") Signed-off-by: Zhang Yi Suggested-by: Jan Kara Link: https://lore.kernel.org/r/20210331121516.2243099-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 633ae7becd61..5f0c7fe32672 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1513,6 +1513,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; + unsigned long used_inos = 0; /* This should not happen, but just to be sure check this */ if (sb_rdonly(sb)) { @@ -1543,22 +1544,37 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, * used inodes so we need to skip blocks with used inodes in * inode table. */ - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) - used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp)), - sbi->s_inodes_per_block); + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + used_inos = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) || - ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp)) < - EXT4_FIRST_INO(sb)))) { - ext4_error(sb, "Something is wrong with group %u: " - "used itable blocks: %d; " - "itable unused count: %u", - group, used_blks, - ext4_itable_unused_count(sb, gdp)); - ret = 1; - goto err_out; + /* Bogus inode unused count? */ + if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + used_inos += group * EXT4_INODES_PER_GROUP(sb); + /* + * Are there some uninitialized inodes in the inode table + * before the first normal inode? + */ + if ((used_blks != sbi->s_itb_per_group) && + (used_inos < EXT4_FIRST_INO(sb))) { + ext4_error(sb, "Something is wrong with group %u: " + "itable unused count: %u; " + "itables initialized count: %ld", + group, ext4_itable_unused_count(sb, gdp), + used_inos); + ret = 1; + goto err_out; + } } blk = ext4_inode_table(sb, gdp) + used_blks; From 67d25186046145748d5fe4c5019d832215e01c1e Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 1 Apr 2021 10:21:23 -0700 Subject: [PATCH 11/32] ext4: drop s_mb_bal_lock and convert protected fields to atomic s_mb_buddies_generated gets used later in this patch series to determine if the cr 0 and cr 1 optimziations should be performed or not. Currently, s_mb_buddies_generated is protected under a spin_lock. In the allocation path, it is better if we don't depend on the lock and instead read the value atomically. In order to do that, we drop s_bal_lock altogether and we convert the only two protected fields by it s_mb_buddies_generated and s_mb_generation_time to atomic type. Signed-off-by: Harshad Shirwadkar Reviewed-by: Andreas Dilger Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20210401172129.189766-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ++--- fs/ext4/mballoc.c | 13 +++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index da7cb2c755b6..7ba90a9fd53a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1553,9 +1553,8 @@ struct ext4_sb_info { atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; + atomic_t s_mb_buddies_generated; /* number of buddies generated */ + atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a02fadf4fc84..34ddce5d6e5d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -816,10 +816,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; - spin_lock(&sbi->s_bal_lock); - sbi->s_mb_buddies_generated++; - sbi->s_mb_generation_time += period; - spin_unlock(&sbi->s_bal_lock); + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); } /* The buddy information is attached the buddy cache inode @@ -2850,7 +2848,6 @@ int ext4_mb_init(struct super_block *sb) } while (i <= sb->s_blocksize_bits + 1); spin_lock_init(&sbi->s_md_lock); - spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); @@ -2986,9 +2983,9 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, - "mballoc: %lu generated and it took %Lu", - sbi->s_mb_buddies_generated, - sbi->s_mb_generation_time); + "mballoc: %u generated and it took %llu", + atomic_read(&sbi->s_mb_buddies_generated), + atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), From b237e3044450fcabc6d63d8578b2fbc8237caba3 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 1 Apr 2021 10:21:24 -0700 Subject: [PATCH 12/32] ext4: add ability to return parsed options from parse_options Before this patch, the function parse_options() was returning journal_devnum and journal_ioprio variables to the caller. This patch generalizes that interface to allow parse_options to return any parsed options to return back to the caller. In this patch series, it gets used to capture the value of "mb_optimize_scan=%u" mount option. Signed-off-by: Harshad Shirwadkar Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20210401172129.189766-3-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 51 +++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7b937a84f657..0ef82e646100 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2090,9 +2090,14 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, return 1; } +struct ext4_parsed_options { + unsigned long journal_devnum; + unsigned int journal_ioprio; +}; + static int handle_mount_opt(struct super_block *sb, char *opt, int token, - substring_t *args, unsigned long *journal_devnum, - unsigned int *journal_ioprio, int is_remount) + substring_t *args, struct ext4_parsed_options *parsed_opts, + int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); const struct mount_opts *m; @@ -2249,7 +2254,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "Cannot specify journal on remount"); return -1; } - *journal_devnum = arg; + parsed_opts->journal_devnum = arg; } else if (token == Opt_journal_path) { char *journal_path; struct inode *journal_inode; @@ -2285,7 +2290,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - *journal_devnum = new_encode_dev(journal_inode->i_rdev); + parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev); path_put(&path); kfree(journal_path); } else if (token == Opt_journal_ioprio) { @@ -2294,7 +2299,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, " (must be 0-7)"); return -1; } - *journal_ioprio = + parsed_opts->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { return ext4_set_test_dummy_encryption(sb, opt, &args[0], @@ -2411,8 +2416,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } static int parse_options(char *options, struct super_block *sb, - unsigned long *journal_devnum, - unsigned int *journal_ioprio, + struct ext4_parsed_options *ret_opts, int is_remount) { struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); @@ -2432,8 +2436,8 @@ static int parse_options(char *options, struct super_block *sb, */ args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); - if (handle_mount_opt(sb, p, token, args, journal_devnum, - journal_ioprio, is_remount) < 0) + if (handle_mount_opt(sb, p, token, args, ret_opts, + is_remount) < 0) return 0; } #ifdef CONFIG_QUOTA @@ -4015,7 +4019,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; const char *descr; @@ -4026,8 +4029,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int needs_recovery, has_huge_files; __u64 blocks_count; int err = 0; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; + struct ext4_parsed_options parsed_opts; + + /* Set defaults for the variables that will be set during parsing */ + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + parsed_opts.journal_devnum = 0; if ((data && !orig_data) || !sbi) goto out_free_base; @@ -4273,8 +4280,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) GFP_KERNEL); if (!s_mount_opts) goto failed_mount; - if (!parse_options(s_mount_opts, sb, &journal_devnum, - &journal_ioprio, 0)) { + if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); @@ -4282,8 +4288,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; - if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, 0)) + if (!parse_options((char *) data, sb, &parsed_opts, 0)) goto failed_mount; #ifdef CONFIG_UNICODE @@ -4768,7 +4773,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - err = ext4_load_journal(sb, es, journal_devnum); + err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && @@ -4868,7 +4873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; @@ -5807,13 +5812,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) struct ext4_mount_options old_opts; int enable_quota = 0; ext4_group_t g; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; #ifdef CONFIG_QUOTA int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif char *orig_data = kstrdup(data, GFP_KERNEL); + struct ext4_parsed_options parsed_opts; + + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + parsed_opts.journal_devnum = 0; if (data && !orig_data) return -ENOMEM; @@ -5844,7 +5852,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_qf_names[i] = NULL; #endif if (sbi->s_journal && sbi->s_journal->j_task->io_context) - journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + parsed_opts.journal_ioprio = + sbi->s_journal->j_task->io_context->ioprio; /* * Some options can be enabled by ext4 and/or by VFS mount flag @@ -5854,7 +5863,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) vfs_flags = SB_LAZYTIME | SB_I_VERSION; sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); - if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { + if (!parse_options(data, sb, &parsed_opts, 1)) { err = -EINVAL; goto restore_opts; } @@ -5904,7 +5913,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); } /* Flush outstanding errors before changing fs state */ From a6c75eaf11032f4a3d2b3ce2265a194ac6e4a7f0 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 1 Apr 2021 10:21:25 -0700 Subject: [PATCH 13/32] ext4: add mballoc stats proc file Add new stats for measuring the performance of mballoc. This patch is forked from Artem Blagodarenko's work that can be found here: https://github.com/lustre/lustre-release/blob/master/ldiskfs/kernel_patches/patches/rhel8/ext4-simple-blockalloc.patch This patch reorganizes the stats by cr level. This is how the output looks like: mballoc: reqs: 0 success: 0 groups_scanned: 0 cr0_stats: hits: 0 groups_considered: 0 useless_loops: 0 bad_suggestions: 0 cr1_stats: hits: 0 groups_considered: 0 useless_loops: 0 bad_suggestions: 0 cr2_stats: hits: 0 groups_considered: 0 useless_loops: 0 cr3_stats: hits: 0 groups_considered: 0 useless_loops: 0 extents_scanned: 0 goal_hits: 0 2^n_hits: 0 breaks: 0 lost: 0 buddies_generated: 0/40 buddies_time_used: 0 preallocated: 0 discarded: 0 Signed-off-by: Harshad Shirwadkar Reviewed-by: Andreas Dilger Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20210401172129.189766-4-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ++++ fs/ext4/mballoc.c | 75 +++++++++++++++++++++++++++++++++++++++++++++-- fs/ext4/sysfs.c | 2 ++ 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7ba90a9fd53a..67a675d222f2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1550,9 +1550,13 @@ struct ext4_sb_info { atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ + atomic64_t s_bal_cX_groups_considered[4]; + atomic64_t s_bal_cX_hits[4]; + atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; @@ -2856,6 +2860,7 @@ int __init ext4_fc_init_dentry_cache(void); extern const struct seq_operations ext4_mb_seq_groups_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; +extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 34ddce5d6e5d..70b8ab19c252 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2146,6 +2146,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_grpblk_t free; int ret = 0; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) ext4_lock_group(sb, group); free = grp->bb_free; @@ -2420,6 +2422,9 @@ repeat: if (ac->ac_status != AC_STATUS_CONTINUE) break; } + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2449,6 +2454,9 @@ repeat: goto repeat; } } + + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; @@ -2548,6 +2556,67 @@ const struct seq_operations ext4_mb_seq_groups_ops = { .show = ext4_mb_seq_groups_show, }; +int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = (struct super_block *)seq->private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + seq_puts(seq, "mballoc:\n"); + if (!sbi->s_mb_stats) { + seq_puts(seq, "\tmb stats collection turned off.\n"); + seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + return 0; + } + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); + + seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); + + seq_puts(seq, "\tcr0_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[0])); + + seq_puts(seq, "\tcr1_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[1])); + + seq_puts(seq, "\tcr2_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[2])); + + seq_puts(seq, "\tcr3_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[3])); + seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); + + seq_printf(seq, "\tbuddies_generated: %u/%u\n", + atomic_read(&sbi->s_mb_buddies_generated), + ext4_get_groups_count(sb)); + seq_printf(seq, "\tbuddies_time_used: %llu\n", + atomic64_read(&sbi->s_mb_generation_time)); + seq_printf(seq, "\tpreallocated: %u\n", + atomic_read(&sbi->s_mb_preallocated)); + seq_printf(seq, "\tdiscarded: %u\n", + atomic_read(&sbi->s_mb_discarded)); + return 0; +} + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; @@ -2975,9 +3044,10 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, - "mballoc: %u extents scanned, %u goal hits, " + "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), @@ -3580,12 +3650,13 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 7367ba406e01..1b5580eb88e6 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -530,6 +530,8 @@ int ext4_register_sysfs(struct super_block *sb) ext4_fc_info_show, sb); proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_ops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); } return 0; } From 4b68f6df105966f04f45f1eca0561b86f2b3551d Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 1 Apr 2021 10:21:26 -0700 Subject: [PATCH 14/32] ext4: add MB_NUM_ORDERS macro A few arrays in mballoc.c use the total number of valid orders as their size. Currently, this value is set as "sb->s_blocksize_bits + 2". This makes code harder to read. So, instead add a new macro MB_NUM_ORDERS(sb) to make the code more readable. Signed-off-by: Harshad Shirwadkar Reviewed-by: Andreas Dilger Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20210401172129.189766-5-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 19 ++++++++++--------- fs/ext4/mballoc.h | 5 +++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 70b8ab19c252..e899a7d21982 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -756,7 +756,7 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) grp->bb_largest_free_order = -1; /* uninit */ - bits = sb->s_blocksize_bits + 1; + bits = MB_NUM_ORDERS(sb) - 1; for (i = bits; i >= 0; i--) { if (grp->bb_counters[i] > 0) { grp->bb_largest_free_order = i; @@ -957,7 +957,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * - (sb->s_blocksize_bits+2)); + (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ @@ -1928,7 +1928,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, int max; BUG_ON(ac->ac_2order <= 0); - for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; @@ -2107,7 +2107,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, if (free < ac->ac_g_ex.fe_len) return false; - if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) + if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) @@ -2315,13 +2315,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ - if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { /* * This should tell if fe_len is exactly power of 2 */ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) ac->ac_2order = array_index_nospec(i - 1, - sb->s_blocksize_bits + 2); + MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ @@ -2880,7 +2880,7 @@ int ext4_mb_init(struct super_block *sb) unsigned max; int ret; - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { @@ -2888,7 +2888,7 @@ int ext4_mb_init(struct super_block *sb) goto out; } - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; @@ -2914,7 +2914,8 @@ int ext4_mb_init(struct super_block *sb) offset_incr = offset_incr >> 1; max = max >> 1; i++; - } while (i <= sb->s_blocksize_bits + 1); + } while (i < MB_NUM_ORDERS(sb)); + spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index e75b4749aa1c..68111a10cfee 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -78,6 +78,11 @@ */ #define MB_DEFAULT_MAX_INODE_PREALLOC 512 +/* + * Number of valid buddy orders + */ +#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) + struct ext4_free_data { /* this links the free block information from sb_info */ struct list_head efd_list; From 196e402adf2e4cd66f101923409f1970ec5f1af3 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 1 Apr 2021 10:21:27 -0700 Subject: [PATCH 15/32] ext4: improve cr 0 / cr 1 group scanning Instead of traversing through groups linearly, scan groups in specific orders at cr 0 and cr 1. At cr 0, we want to find groups that have the largest free order >= the order of the request. So, with this patch, we maintain lists for each possible order and insert each group into a list based on the largest free order in its buddy bitmap. During cr 0 allocation, we traverse these lists in the increasing order of largest free orders. This allows us to find a group with the best available cr 0 match in constant time. If nothing can be found, we fallback to cr 1 immediately. At CR1, the story is slightly different. We want to traverse in the order of increasing average fragment size. For CR1, we maintain a rb tree of groupinfos which is sorted by average fragment size. Instead of traversing linearly, at CR1, we traverse in the order of increasing average fragment size, starting at the most optimal group. This brings down cr 1 search complexity to log(num groups). For cr >= 2, we just perform the linear search as before. Also, in case of lock contention, we intermittently fallback to linear search even in CR 0 and CR 1 cases. This allows us to proceed during the allocation path even in case of high contention. There is an opportunity to do optimization at CR2 too. That's because at CR2 we only consider groups where bb_free counter (number of free blocks) is greater than the request extent size. That's left as future work. All the changes introduced in this patch are protected under a new mount option "mb_optimize_scan". With this patchset, following experiment was performed: Created a highly fragmented disk of size 65TB. The disk had no contiguous 2M regions. Following command was run consecutively for 3 times: time dd if=/dev/urandom of=file bs=2M count=10 Here are the results with and without cr 0/1 optimizations introduced in this patch: |---------+------------------------------+---------------------------| | | Without CR 0/1 Optimizations | With CR 0/1 Optimizations | |---------+------------------------------+---------------------------| | 1st run | 5m1.871s | 2m47.642s | | 2nd run | 2m28.390s | 0m0.611s | | 3rd run | 2m26.530s | 0m1.255s | |---------+------------------------------+---------------------------| Signed-off-by: Harshad Shirwadkar Reported-by: kernel test robot Reported-by: Dan Carpenter Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20210401172129.189766-6-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 21 ++- fs/ext4/mballoc.c | 399 ++++++++++++++++++++++++++++++++++++++++++++-- fs/ext4/mballoc.h | 17 +- fs/ext4/super.c | 28 +++- fs/ext4/sysfs.c | 2 + 5 files changed, 452 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 67a675d222f2..42df628b35e5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -162,7 +162,12 @@ enum SHIFT_DIRECTION { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 - +/* Large fragment size list lookup succeeded at least once for cr = 0 */ +#define EXT4_MB_CR0_OPTIMIZED 0x8000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +#define EXT4_MB_CR1_OPTIMIZED 0x00010000 +/* Perform linear traversal for one group */ +#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1247,7 +1252,9 @@ struct ext4_inode_info { #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ - +#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group + * scanning in mballoc + */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1528,9 +1535,14 @@ struct ext4_sb_info { unsigned int s_mb_free_pending; struct list_head s_freed_data_list; /* List of blocks to be freed after commit completed */ + struct rb_root s_mb_avg_fragment_size_root; + rwlock_t s_mb_rb_lock; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; /* tunables */ unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; @@ -1554,6 +1566,8 @@ struct ext4_sb_info { atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ + atomic_t s_bal_cr0_bad_suggestions; + atomic_t s_bal_cr1_bad_suggestions; atomic64_t s_bal_cX_groups_considered[4]; atomic64_t s_bal_cX_hits[4]; atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ @@ -3360,11 +3374,14 @@ struct ext4_group_info { ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; + struct rb_node bb_avg_fragment_size_rb; + struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e899a7d21982..c62555598a8e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -127,11 +127,50 @@ * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * + * If "mb_optimize_scan" mount option is set, we maintain in memory group info + * structures in two data structures: + * + * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) + * + * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) + * + * This is an array of lists where the index in the array represents the + * largest free order in the buddy bitmap of the participating group infos of + * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * + * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) + * + * Locking: sbi->s_mb_rb_lock (rwlock) + * + * This is a red black tree consisting of group infos and the tree is sorted + * by average fragment sizes (which is calculated as ext4_group_info->bb_free + * / ext4_group_info->bb_fragments). + * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for + * fulfilling an allocation request. + * + * At CR = 0, we look for groups which have the largest_free_order >= the order + * of the request. We directly look at the largest free order list in the data + * structure (1) above where largest_free_order = order of the request. If that + * list is empty, we look at remaining list in the increasing order of + * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or + * equal to request size using our rb tree (data structure 2) in O(log N) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4//mb_min_to_scan * /sys/fs/ext4//mb_max_to_scan * /sys/fs/ext4//mb_order2_req + * /sys/fs/ext4//mb_linear_limit * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -149,6 +188,16 @@ * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * + * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not + * get traversed linearly. That may result in subsequent allocations being not + * close to each other. And so, the underlying device may get filled up in a + * non-linear fashion. While that may not matter on non-rotational devices, for + * rotational devices that may result in higher seek times. "mb_linear_limit" + * tells mballoc how many groups mballoc should search linearly before + * performing consulting above data structures for more efficient lookups. For + * non rotational devices, this value defaults to 0 and for rotational devices + * this is set to MB_DEFAULT_LINEAR_LIMIT. + * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the @@ -299,6 +348,8 @@ * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) + * - cr0 lists lock (cr0) + * - cr1 tree lock (cr1) * * Paths: * - new pa @@ -328,6 +379,9 @@ * group * object * + * - allocation path (ext4_mb_regular_allocator) + * group + * cr0/cr1 */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; @@ -351,6 +405,9 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr); + /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block @@ -744,6 +801,269 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } +static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, + int (*cmp)(struct rb_node *, struct rb_node *)) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + + while (*iter) { + parent = *iter; + if (cmp(new, *iter) > 0) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } + + rb_link_node(new, parent, iter); + rb_insert_color(new, root); +} + +static int +ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +{ + struct ext4_group_info *grp1 = rb_entry(rb1, + struct ext4_group_info, + bb_avg_fragment_size_rb); + struct ext4_group_info *grp2 = rb_entry(rb2, + struct ext4_group_info, + bb_avg_fragment_size_rb); + int num_frags_1, num_frags_2; + + num_frags_1 = grp1->bb_fragments ? + grp1->bb_free / grp1->bb_fragments : 0; + num_frags_2 = grp2->bb_fragments ? + grp2->bb_free / grp2->bb_fragments : 0; + + return (num_frags_2 - num_frags_1); +} + +/* + * Reinsert grpinfo into the avg_fragment_size tree with new average + * fragment size. + */ +static void +mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + + write_lock(&sbi->s_mb_rb_lock); + if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { + rb_erase(&grp->bb_avg_fragment_size_rb, + &sbi->s_mb_avg_fragment_size_root); + RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); + } + + ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, + &grp->bb_avg_fragment_size_rb, + ext4_mb_avg_fragment_size_cmp); + write_unlock(&sbi->s_mb_rb_lock); +} + +/* + * Choose next group by traversing largest_free_order lists. Updates *new_cr if + * cr level needs an update. + */ +static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *iter, *grp; + int i; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) + atomic_inc(&sbi->s_bal_cr0_bad_suggestions); + + grp = NULL; + for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + if (list_empty(&sbi->s_mb_largest_free_orders[i])) + continue; + read_lock(&sbi->s_mb_largest_free_orders_locks[i]); + if (list_empty(&sbi->s_mb_largest_free_orders[i])) { + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + continue; + } + grp = NULL; + list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], + bb_largest_free_order_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + grp = iter; + break; + } + } + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + if (grp) + break; + } + + if (!grp) { + /* Increment cr and search again */ + *new_cr = 1; + } else { + *group = grp->bb_group; + ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } +} + +/* + * Choose next group by traversing average fragment size tree. Updates *new_cr + * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that + * the linear search should continue for one iteration since there's lock + * contention on the rb tree lock. + */ +static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int avg_fragment_size, best_so_far; + struct rb_node *node, *found; + struct ext4_group_info *grp; + + /* + * If there is contention on the lock, instead of waiting for the lock + * to become available, just continue searching lineraly. We'll resume + * our rb tree search later starting at ac->ac_last_optimal_group. + */ + if (!read_trylock(&sbi->s_mb_rb_lock)) { + ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; + return; + } + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); + /* We have found something at CR 1 in the past */ + grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); + for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; + found = rb_next(found)) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); + if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) + break; + } + goto done; + } + + node = sbi->s_mb_avg_fragment_size_root.rb_node; + best_so_far = 0; + found = NULL; + + while (node) { + grp = rb_entry(node, struct ext4_group_info, + bb_avg_fragment_size_rb); + avg_fragment_size = 0; + if (ext4_mb_good_group(ac, grp->bb_group, 1)) { + avg_fragment_size = grp->bb_fragments ? + grp->bb_free / grp->bb_fragments : 0; + if (!best_so_far || avg_fragment_size < best_so_far) { + best_so_far = avg_fragment_size; + found = node; + } + } + if (avg_fragment_size > ac->ac_g_ex.fe_len) + node = node->rb_right; + else + node = node->rb_left; + } + +done: + if (found) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } + + read_unlock(&sbi->s_mb_rb_lock); + ac->ac_last_optimal_group = *group; +} + +static inline int should_optimize_scan(struct ext4_allocation_context *ac) +{ + if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) + return 0; + if (ac->ac_criteria >= 2) + return 0; + if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) + return 0; + return 1; +} + +/* + * Return next linear group for allocation. If linear traversal should not be + * performed, this function just returns the same group + */ +static int +next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +{ + if (!should_optimize_scan(ac)) + goto inc_and_return; + + if (ac->ac_groups_linear_remaining) { + ac->ac_groups_linear_remaining--; + goto inc_and_return; + } + + if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { + ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; + goto inc_and_return; + } + + return group; +inc_and_return: + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + return group + 1 >= ngroups ? 0 : group + 1; +} + +/* + * ext4_mb_choose_next_group: choose next group for allocation. + * + * @ac Allocation Context + * @new_cr This is an output parameter. If the there is no good group + * available at current CR level, this field is updated to indicate + * the new cr level that should be used. + * @group This is an input / output parameter. As an input it indicates the + * next group that the allocator intends to use for allocation. As + * output, this field indicates the next group that should be used as + * determined by the optimization functions. + * @ngroups Total number of groups + */ +static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + *new_cr = ac->ac_criteria; + + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) + return; + + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); + } else if (*new_cr == 1) { + ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + } else { + /* + * TODO: For CR=2, we can arrange groups in an rb tree sorted by + * bb_free. But until that happens, we should never come here. + */ + WARN_ON(1); + } +} + /* * Cache the order of the largest free extent we have available in this block * group. @@ -751,18 +1071,33 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { + struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - int bits; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } grp->bb_largest_free_order = -1; /* uninit */ - bits = MB_NUM_ORDERS(sb) - 1; - for (i = bits; i >= 0; i--) { + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { if (grp->bb_counters[i] > 0) { grp->bb_largest_free_order = i; break; } } + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && + grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, + &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } } static noinline_for_stack @@ -818,6 +1153,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); + mb_update_avg_fragment_size(sb, grp); } /* The buddy information is attached the buddy cache inode @@ -1517,6 +1853,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, done: mb_set_largest_free_order(sb, e4b->bd_info); + mb_update_avg_fragment_size(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -1653,6 +1990,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); @@ -2347,17 +2685,21 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; + ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; - for (i = 0; i < ngroups; group++, i++) { - int ret = 0; + for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), + i++) { + int ret = 0, new_cr; + cond_resched(); - /* - * Artificially restricted ngroups for non-extent - * files makes group > ngroups possible on first loop. - */ - if (group >= ngroups) - group = 0; + + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); + if (new_cr != cr) { + cr = new_cr; + goto repeat; + } /* * Batch reads of the block allocation bitmaps @@ -2578,6 +2920,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic64_read(&sbi->s_bal_cX_groups_considered[0])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[0])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr0_bad_suggestions)); seq_puts(seq, "\tcr1_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); @@ -2585,6 +2929,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic64_read(&sbi->s_bal_cX_groups_considered[1])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[1])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr1_bad_suggestions)); seq_puts(seq, "\tcr2_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); @@ -2719,7 +3065,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); + RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; @@ -2916,6 +3265,26 @@ int ext4_mb_init(struct super_block *sb) i++; } while (i < MB_NUM_ORDERS(sb)); + sbi->s_mb_avg_fragment_size_root = RB_ROOT; + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders) { + ret = -ENOMEM; + goto out; + } + sbi->s_mb_largest_free_orders_locks = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders_locks) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } + rwlock_init(&sbi->s_mb_rb_lock); spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; @@ -2968,6 +3337,10 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + sbi->s_mb_max_linear_groups = 0; + else + sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) @@ -2979,6 +3352,8 @@ out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); @@ -3035,6 +3410,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 68111a10cfee..02585e3cbcad 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -78,6 +78,18 @@ */ #define MB_DEFAULT_MAX_INODE_PREALLOC 512 +/* + * Number of groups to search linearly before performing group scanning + * optimization. + */ +#define MB_DEFAULT_LINEAR_LIMIT 4 + +/* + * Minimum number of groups that should be present in the file system to perform + * group scanning optimizations. + */ +#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 + /* * Number of valid buddy orders */ @@ -166,11 +178,14 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; + ext4_group_t ac_last_optimal_group; + __u32 ac_groups_considered; + __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; + __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_tail; __u16 ac_buddy; - __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; __u8 ac_2order; /* if request is to allocate 2^N blocks and diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0ef82e646100..3b12deb93a6b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1688,7 +1688,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, + Opt_prefetch_block_bitmaps, Opt_mb_optimize_scan, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif @@ -1789,6 +1789,7 @@ static const match_table_t tokens = { {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, + {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1821,6 +1822,8 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#define DEFAULT_MB_OPTIMIZE_SCAN (-1) + static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; @@ -2009,6 +2012,7 @@ static const struct mount_opts { {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, + {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, @@ -2093,6 +2097,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, struct ext4_parsed_options { unsigned long journal_devnum; unsigned int journal_ioprio; + int mb_optimize_scan; }; static int handle_mount_opt(struct super_block *sb, char *opt, int token, @@ -2389,6 +2394,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_mount_opt |= m->mount_opt; } else if (token == Opt_data_err_ignore) { sbi->s_mount_opt &= ~m->mount_opt; + } else if (token == Opt_mb_optimize_scan) { + if (arg != 0 && arg != 1) { + ext4_msg(sb, KERN_WARNING, + "mb_optimize_scan should be set to 0 or 1."); + return -1; + } + parsed_opts->mb_optimize_scan = arg; } else { if (!args->from) arg = 1; @@ -4035,6 +4047,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Set defaults for the variables that will be set during parsing */ parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; parsed_opts.journal_devnum = 0; + parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; if ((data && !orig_data) || !sbi) goto out_free_base; @@ -4979,6 +4992,19 @@ no_journal: ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); + + /* + * Enable optimize_scan if number of groups is > threshold. This can be + * turned off by passing "mb_optimize_scan=0". This can also be + * turned on forcefully by passing "mb_optimize_scan=1". + */ + if (parsed_opts.mb_optimize_scan == 1) + set_opt2(sb, MB_OPTIMIZE_SCAN); + else if (parsed_opts.mb_optimize_scan == 0) + clear_opt2(sb, MB_OPTIMIZE_SCAN); + else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) + set_opt2(sb, MB_OPTIMIZE_SCAN); + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 1b5580eb88e6..8716790e7e21 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -215,6 +215,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); +EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); @@ -263,6 +264,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(mb_max_inode_prealloc), + ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), From f68f4063855903fd3a279e646451eab04db0655f Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 1 Apr 2021 10:21:28 -0700 Subject: [PATCH 16/32] ext4: add proc files to monitor new structures This patch adds a new file "mb_structs_summary" which allows us to see the summary of the new allocator structures added in this series. Here's the sample output of file: optimize_scan: 1 max_free_order_lists: list_order_0_groups: 0 list_order_1_groups: 0 list_order_2_groups: 0 list_order_3_groups: 0 list_order_4_groups: 0 list_order_5_groups: 0 list_order_6_groups: 0 list_order_7_groups: 0 list_order_8_groups: 0 list_order_9_groups: 0 list_order_10_groups: 0 list_order_11_groups: 0 list_order_12_groups: 0 list_order_13_groups: 40 fragment_size_tree: tree_min: 16384 tree_max: 32768 tree_nodes: 40 Signed-off-by: Harshad Shirwadkar Reviewed-by: Andreas Dilger Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20210401172129.189766-7-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/sysfs.c | 2 ++ 3 files changed, 89 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 42df628b35e5..369b2d2c8850 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2872,6 +2872,7 @@ int __init ext4_fc_init_dentry_cache(void); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; +extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c62555598a8e..bf180b9e9062 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2963,6 +2963,92 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) return 0; } +static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + read_lock(&EXT4_SB(sb)->s_mb_rb_lock); + + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + ++*pos; + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; + struct rb_node *n; + unsigned int count, min, max; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { + seq_puts(seq, "fragment_size_tree:\n"); + n = rb_first(&sbi->s_mb_avg_fragment_size_root); + if (!n) { + seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); + return 0; + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + count = 1; + while (rb_next(n)) { + count++; + n = rb_next(n); + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + + seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", + min, max, count); + return 0; + } + + if (position == 0) { + seq_printf(seq, "optimize_scan: %d\n", + test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], + bb_largest_free_order_node) + count++; + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + + return 0; +} + +static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + + read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); +} + +const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .start = ext4_mb_seq_structs_summary_start, + .next = ext4_mb_seq_structs_summary_next, + .stop = ext4_mb_seq_structs_summary_stop, + .show = ext4_mb_seq_structs_summary_show, +}; + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 8716790e7e21..6f825dedc3d4 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -534,6 +534,8 @@ int ext4_register_sysfs(struct super_block *sb) &ext4_mb_seq_groups_ops, sb); proc_create_single_data("mb_stats", 0444, sbi->s_proc, ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, + &ext4_mb_seq_structs_summary_ops, sb); } return 0; } From 21175ca434c5d49509b73cf473618b01b0b85437 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 1 Apr 2021 10:21:29 -0700 Subject: [PATCH 17/32] ext4: make prefetch_block_bitmaps default Block bitmap prefetching is needed for these allocator optimization data structures to get populated and provide better group scanning order. So, turn it on bu default. prefetch_block_bitmaps mount option is now marked as removed and a new option no_prefetch_block_bitmaps is added to disable block bitmap prefetching. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20210401172129.189766-8-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/super.c | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 369b2d2c8850..a29660db86ac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1227,7 +1227,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ -#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 +#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3b12deb93a6b..0f9af7ba9210 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1688,7 +1688,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, Opt_mb_optimize_scan, + Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif @@ -1788,7 +1788,8 @@ static const match_table_t tokens = { {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ - {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, + {Opt_removed, "prefetch_block_bitmaps"}, + {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ @@ -2010,7 +2011,7 @@ static const struct mount_opts { {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, - {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, #ifdef CONFIG_EXT4_DEBUG @@ -3707,11 +3708,11 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, elr->lr_super = sb; elr->lr_first_not_zeroed = start; - if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) - elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; - else { + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; + } else { + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* @@ -3742,7 +3743,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))) goto out; From 72ffb49a7b623c92a37657eda7cc46a06d3e8398 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 31 Mar 2021 11:31:38 +0800 Subject: [PATCH 18/32] ext4: do not set SB_ACTIVE in ext4_orphan_cleanup() When CONFIG_QUOTA is enabled, if we failed to mount the filesystem due to some error happens behind ext4_orphan_cleanup(), it will end up triggering a after free issue of super_block. The problem is that ext4_orphan_cleanup() will set SB_ACTIVE flag if CONFIG_QUOTA is enabled, after we cleanup the truncated inodes, the last iput() will put them into the lru list, and these inodes' pages may probably dirty and will be write back by the writeback thread, so it could be raced by freeing super_block in the error path of mount_bdev(). After check the setting of SB_ACTIVE flag in ext4_orphan_cleanup(), it was used to ensure updating the quota file properly, but evict inode and trash data immediately in the last iput does not affect the quotafile, so setting the SB_ACTIVE flag seems not required[1]. Fix this issue by just remove the SB_ACTIVE setting. [1] https://lore.kernel.org/linux-ext4/99cce8ca-e4a0-7301-840f-2ace67c551f3@huawei.com/T/#m04990cfbc4f44592421736b504afcc346b2a7c00 Cc: stable@kernel.org Signed-off-by: Zhang Yi Tested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20210331033138.918975-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0f9af7ba9210..097ab07c9272 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3040,9 +3040,6 @@ static void ext4_orphan_cleanup(struct super_block *sb, sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. From 3cd461712cb450ea6aa8a8200f35f24eb296bfb2 Mon Sep 17 00:00:00 2001 From: Yang Guo Date: Thu, 1 Apr 2021 15:03:30 +0800 Subject: [PATCH 19/32] ext4: delete redundant uptodate check for buffer The buffer uptodate state has been checked in function set_buffer_uptodate, there is no need use buffer_uptodate before calling set_buffer_uptodate and delete it. Cc: "Theodore Ts'o" Cc: Andreas Dilger Signed-off-by: Yang Guo Signed-off-by: Shaokun Zhang Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/1617260610-29770-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0948a43f1b3d..32fa3ad38797 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1066,8 +1066,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); } continue; } @@ -1092,8 +1091,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } } if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && From ac2f7ca51b0929461ea49918f27c11b680f28995 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 1 Apr 2021 16:19:03 +0800 Subject: [PATCH 20/32] ext4: always panic when errors=panic is specified Before commit 014c9caa29d3 ("ext4: make ext4_abort() use __ext4_error()"), the following series of commands would trigger a panic: 1. mount /dev/sda -o ro,errors=panic test 2. mount /dev/sda -o remount,abort test After commit 014c9caa29d3, remounting a file system using the test mount option "abort" will no longer trigger a panic. This commit will restore the behaviour immediately before commit 014c9caa29d3. (However, note that the Linux kernel's behavior has not been consistent; some previous kernel versions, including 5.4 and 4.19 similarly did not panic after using the mount option "abort".) This also makes a change to long-standing behaviour; namely, the following series commands will now cause a panic, when previously it did not: 1. mount /dev/sda -o ro,errors=panic test 2. echo test > /sys/fs/ext4/sda/trigger_fs_error However, this makes ext4's behaviour much more consistent, so this is a good thing. Cc: stable@kernel.org Fixes: 014c9caa29d3 ("ext4: make ext4_abort() use __ext4_error()") Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20210401081903.3421208-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 097ab07c9272..d92360d6bf26 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -667,9 +667,6 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, ext4_commit_super(sb); } - if (sb_rdonly(sb) || continue_fs) - return; - /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already @@ -679,6 +676,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } + + if (sb_rdonly(sb) || continue_fs) + return; + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before From f88f1466e2a2e5ca17dfada436d3efa1b03a3972 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 2 Apr 2021 18:16:31 +0800 Subject: [PATCH 21/32] ext4: fix error code in ext4_commit_super We should set the error code when ext4_commit_super check argument failed. Found in code review. Fixes: c4be0c1dc4cdc ("filesystem freeze: add error handling of write_super_lockfs/unlockfs"). Cc: stable@kernel.org Signed-off-by: Fengnan Chang Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20210402101631.561-1-changfengnan@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d92360d6bf26..1f5ef17f714a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5585,8 +5585,10 @@ static int ext4_commit_super(struct super_block *sb) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh || block_device_ejected(sb)) - return error; + if (!sbh) + return -EINVAL; + if (block_device_ejected(sb)) + return -ENODEV; ext4_update_super(sb); From 6810fad956df9e5467e8e8a5ac66fda0836c71fa Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 6 Apr 2021 10:53:31 +0800 Subject: [PATCH 22/32] ext4: fix ext4_error_err save negative errno into superblock Fix As write_mmp_block() so that it returns -EIO instead of 1, so that the correct error gets saved into the superblock. Cc: stable@kernel.org Fixes: 54d3adbc29f0 ("ext4: save all error info in save_error_info() and drop ext4_set_errno()") Reported-by: Liu Zhi Qiang Signed-off-by: Ye Bin Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20210406025331.148343-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 795c3ff2907c..68fbeedd627b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -56,7 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) - return 1; + return -EIO; return 0; } From 3b1833e92baba135923af4a07e73fe6e54be5a2f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 Apr 2021 18:17:59 +0200 Subject: [PATCH 23/32] ext4: annotate data race in start_this_handle() Access to journal->j_running_transaction is not protected by appropriate lock and thus is racy. We are well aware of that and the code handles the race properly. Just add a comment and data_race() annotation. Cc: stable@kernel.org Reported-by: syzbot+30774a6acf6a2cf6d535@syzkaller.appspotmail.com Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210406161804.20150-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9396666b7314..398d1d9209e2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -349,7 +349,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle, } alloc_transaction: - if (!journal->j_running_transaction) { + /* + * This check is racy but it is just an optimization of allocating new + * transaction early if there are high chances we'll need it. If we + * guess wrong, we'll retry or free unused transaction. + */ + if (!data_race(journal->j_running_transaction)) { /* * If __GFP_FS is not present, then we may be being called from * inside the fs writeback layer, so we MUST NOT fail. From 83fe6b18b8d04c6c849379005e1679bac9752466 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 Apr 2021 18:18:00 +0200 Subject: [PATCH 24/32] ext4: annotate data race in jbd2_journal_dirty_metadata() Assertion checks in jbd2_journal_dirty_metadata() are known to be racy but we don't want to be grabbing locks just for them. We thus recheck them under b_state_lock only if it looks like they would fail. Annotate the checks with data_race(). Cc: stable@kernel.org Reported-by: Hao Sun Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210406161804.20150-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 398d1d9209e2..e8fc45fd751f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1479,8 +1479,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * crucial to catch bugs so let's do a reliable check until the * lockless handling is fully proven. */ - if (jh->b_transaction != transaction && - jh->b_next_transaction != transaction) { + if (data_race(jh->b_transaction != transaction && + jh->b_next_transaction != transaction)) { spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); @@ -1488,8 +1488,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. */ - if (jh->b_transaction == transaction && - jh->b_jlist != BJ_Metadata) { + if (data_race(jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata)) { spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) From e1262cd2e68a0870fb9fc95eb202d22e8f0074b7 Mon Sep 17 00:00:00 2001 From: Xu Yihang Date: Thu, 8 Apr 2021 15:00:33 +0800 Subject: [PATCH 25/32] ext4: fix error return code in ext4_fc_perform_commit() In case of if not ext4_fc_add_tlv branch, an error return code is missing. Cc: stable@kernel.org Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Reported-by: Hulk Robot Signed-off-by: Xu Yihang Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20210408070033.123047-1-xuyihang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7541d0b5d706..312273ed8a9f 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1088,8 +1088,10 @@ static int ext4_fc_perform_commit(journal_t *journal) head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), - (u8 *)&head, &crc)) + (u8 *)&head, &crc)) { + ret = -ENOSPC; goto out; + } } spin_lock(&sbi->s_fc_lock); From 3088e5a5153cda27ec26461e5edf2821e15e802c Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sat, 27 Mar 2021 16:00:05 +0530 Subject: [PATCH 26/32] ext4: fix various seppling typos Signed-off-by: Bhaskar Chowdhury Link: https://lore.kernel.org/r/cover.1616840203.git.unixbhaskar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- fs/ext4/indirect.c | 2 +- fs/ext4/inline.c | 2 +- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.h | 2 +- fs/ext4/migrate.c | 6 +++--- fs/ext4/namei.c | 2 +- fs/ext4/xattr.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 312273ed8a9f..2ade51e29d83 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -66,7 +66,7 @@ * Fast Commit Ineligibility * ------------------------- * Not all operations are supported by fast commits today (e.g extended - * attributes). Fast commit ineligiblity is marked by calling one of the + * attributes). Fast commit ineligibility is marked by calling one of the * two following functions: * * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 1223a18c3ff9..a7bc6ad656a9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -705,7 +705,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, /* * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make + * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index ba960e6e7e3a..3cf01629010d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -795,7 +795,7 @@ ext4_journalled_write_inline_data(struct inode *inode, * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't - * need to start the journal since the file's metatdata isn't changed now. + * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 32fa3ad38797..fe6045a46599 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3822,7 +3822,7 @@ unlock: * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' + * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 02585e3cbcad..39da92ceabf8 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -59,7 +59,7 @@ * by the stream allocator, which purpose is to pack requests * as close each to other as possible to produce smooth I/O traffic * We use locality group prealloc space for stream request. - * We can tune the same via /proc/fs/ext4//stream_req + * We can tune the same via /proc/fs/ext4//stream_req */ #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index c5e3fc998211..7e0b4f81c6c0 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -32,7 +32,7 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); - /* Locking only for convinience since we are operating on temp inode */ + /* Locking only for convenience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { @@ -43,8 +43,8 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * Calculate the credit needed to inserting this extent - * Since we are doing this in loop we may accumalate extra - * credit. But below we try to not accumalate too much + * Since we are doing this in loop we may accumulate extra + * credit. But below we try to not accumulate too much * of them by restarting the journal. */ needed = ext4_ext_calc_credits_for_single_extent(inode, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f2e9cb71763d..610cb0636ee7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -979,7 +979,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the - * desired contiuation hash. If it doesn't, return since + * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. */ bhash = dx_get_hash(p->at); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6c1018223c54..10ba4b24a0aa 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1617,7 +1617,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the - * whole space and prevent futher entries being added. + * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && From 666245d9a436d9b98c975ca19d0a707d600d8666 Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Fri, 9 Apr 2021 12:20:35 +0800 Subject: [PATCH 27/32] ext4: fix trailing whitespace Made suggested modifications from checkpatch in reference to ERROR: trailing whitespace Signed-off-by: Jack Qiu Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20210409042035.15516-1-jack.qiu@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 2 +- fs/ext4/mballoc.c | 2 +- fs/ext4/namei.c | 6 +++--- fs/ext4/super.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 74a5172c2d83..9dc6e74b265c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -239,7 +239,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - return num_clusters_in_group(sb, block_group) - + return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bf180b9e9062..3239e6669e84 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3089,7 +3089,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); - ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 610cb0636ee7..680fc3211cbf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2234,10 +2234,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) - goto out_frames; + goto out_frames; retval = ext4_handle_dirty_dirblock(handle, dir, bh2); if (retval) - goto out_frames; + goto out_frames; de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { @@ -3469,7 +3469,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, * for transaction commit if we are running out of space * and thus we deadlock. So we have to stop transaction now * and restart it when symlink contents is written. - * + * * To keep fs consistent in case of crash, we have to put inode * to orphan list in the mean time. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1f5ef17f714a..886e0d518668 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5020,7 +5020,7 @@ no_journal: ext4_journal_commit_callback; block = ext4_count_free_clusters(sb); - ext4_free_blocks_count_set(sbi->s_es, + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); From fcdf3c34b7abdcbb49690c94c7fa6ce224dc9749 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 Apr 2021 22:12:05 +0200 Subject: [PATCH 28/32] ext4: fix debug format string warning Using no_printk() for jbd_debug() revealed two warnings: fs/jbd2/recovery.c: In function 'fc_do_one_pass': fs/jbd2/recovery.c:256:30: error: format '%d' expects a matching 'int' argument [-Werror=format=] 256 | jbd_debug(3, "Processing fast commit blk with seq %d"); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/ext4/fast_commit.c: In function 'ext4_fc_replay_add_range': fs/ext4/fast_commit.c:1732:30: error: format '%d' expects argument of type 'int', but argument 2 has type 'long unsigned int' [-Werror=format=] 1732 | jbd_debug(1, "Converting from %d to %d %lld", The first one was added incorrectly, and was also missing a few newlines in debug output, and the second one happened when the type of an argument changed. Reported-by: kernel test robot Fixes: d556435156b7 ("jbd2: avoid -Wempty-body warnings") Fixes: 6db074618969 ("ext4: use BIT() macro for BH_** state bits") Fixes: 5b849b5f96b4 ("jbd2: fast commit recovery path") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210409201211.1866633-1-arnd@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 2 +- fs/jbd2/recovery.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 2ade51e29d83..f98ca4f37ef6 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1736,7 +1736,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } /* Range is mapped and needs a state change */ - jbd_debug(1, "Converting from %d to %d %lld", + jbd_debug(1, "Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 69f18fe20923..d47a0d96bf30 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -245,15 +245,14 @@ static int fc_do_one_pass(journal_t *journal, return 0; while (next_fc_block <= journal->j_fc_last) { - jbd_debug(3, "Fast commit replay: next block %ld", + jbd_debug(3, "Fast commit replay: next block %ld\n", next_fc_block); err = jread(&bh, journal, next_fc_block); if (err) { - jbd_debug(3, "Fast commit replay: read error"); + jbd_debug(3, "Fast commit replay: read error\n"); break; } - jbd_debug(3, "Processing fast commit blk with seq %d"); err = journal->j_fc_replay_callback(journal, bh, pass, next_fc_block - journal->j_fc_first, expected_commit_id); From 4811d9929cdae4238baf5b2522247bd2f9fa7b50 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 12 Apr 2021 17:19:00 -0400 Subject: [PATCH 29/32] ext4: allow the dax flag to be set and cleared on inline directories This is needed to allow generic/607 to pass for file systems with the inline data_feature enabled, and it allows the use of file systems where the directories use inline_data, while the files are accessed via DAX. Cc: stable@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 3 ++- fs/ext4/ioctl.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 5f0c7fe32672..71d321b3b984 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1292,7 +1292,8 @@ got: ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; - if (ext4_has_feature_inline_data(sb)) + if (ext4_has_feature_inline_data(sb) && + (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a2cf35066f46..0796bfa72829 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -315,6 +315,12 @@ static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) static bool dax_compatible(struct inode *inode, unsigned int oldflags, unsigned int flags) { + /* Allow the DAX flag to be changed on inline directories */ + if (S_ISDIR(inode->i_mode)) { + flags &= ~EXT4_INLINE_DATA_FL; + oldflags &= ~EXT4_INLINE_DATA_FL; + } + if (flags & EXT4_DAX_FL) { if ((oldflags & EXT4_DAX_MUT_EXCL) || ext4_test_inode_state(inode, From 5afa7e8b70d65819245fece61a65fd753b4aae33 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 17 Apr 2021 23:03:50 -0400 Subject: [PATCH 30/32] fs: fix reporting supported extra file attributes for statx() statx(2) notes that any attribute that is not indicated as supported by stx_attributes_mask has no usable value. Commits 801e523796004 ("fs: move generic stat response attr handling to vfs_getattr_nosec") and 712b2698e4c02 ("fs/stat: Define DAX statx attribute") sets STATX_ATTR_AUTOMOUNT and STATX_ATTR_DAX, respectively, without setting stx_attributes_mask, which can cause xfstests generic/532 to fail. Fix this in the same way as commit 1b9598c8fb99 ("xfs: fix reporting supported extra file attributes for statx()") Fixes: 801e523796004 ("fs: move generic stat response attr handling to vfs_getattr_nosec") Fixes: 712b2698e4c02 ("fs/stat: Define DAX statx attribute") Cc: stable@kernel.org Signed-off-by: Theodore Ts'o --- fs/stat.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/stat.c b/fs/stat.c index fbc171d038aa..1fa38bdec1a6 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -86,12 +86,20 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, /* SB_NOATIME means filesystem supplies dummy atime value */ if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; + + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; if (IS_DAX(inode)) stat->attributes |= STATX_ATTR_DAX; + stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | + STATX_ATTR_DAX); + mnt_userns = mnt_user_ns(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(mnt_userns, path, stat, From 5899593f51e63dde2f07c67358bd65a641585abb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 15 Apr 2021 17:54:17 +0200 Subject: [PATCH 31/32] ext4: Fix occasional generic/418 failure Eric has noticed that after pagecache read rework, generic/418 is occasionally failing for ext4 when blocksize < pagesize. In fact, the pagecache rework just made hard to hit race in ext4 more likely. The problem is that since ext4 conversion of direct IO writes to iomap framework (commit 378f32bab371), we update inode size after direct IO write only after invalidating page cache. Thus if buffered read sneaks at unfortunate moment like: CPU1 - write at offset 1k CPU2 - read from offset 0 iomap_dio_rw(..., IOMAP_DIO_FORCE_WAIT); ext4_readpage(); ext4_handle_inode_extension() the read will zero out tail of the page as it still sees smaller inode size and thus page cache becomes inconsistent with on-disk contents with all the consequences. Fix the problem by moving inode size update into end_io handler which gets called before the page cache is invalidated. Reported-and-tested-by: Eric Whitney Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Acked-by: Dave Chinner Link: https://lore.kernel.org/r/20210415155417.4734-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 194f5d00fa32..7924634ab0bf 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -371,15 +371,32 @@ truncate: static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { - loff_t offset = iocb->ki_pos; + loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (error) return error; - if (size && flags & IOMAP_DIO_UNWRITTEN) - return ext4_convert_unwritten_extents(NULL, inode, - offset, size); + if (size && flags & IOMAP_DIO_UNWRITTEN) { + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); + if (error < 0) + return error; + } + /* + * If we are extending the file, we have to update i_size here before + * page cache gets invalidated in iomap_dio_rw(). Otherwise racing + * buffered reads could zero out too much from page cache pages. Update + * of on-disk size will happen later in ext4_dio_write_iter() where + * we have enough information to also perform orphan list handling etc. + * Note that we perform all extending writes synchronously under + * i_rwsem held exclusively so i_size update is safe here in that case. + * If the write was not extending, we cannot see pos > i_size here + * because operations reducing i_size like truncate wait for all + * outstanding DIO before updating i_size. + */ + pos += size; + if (pos > i_size_read(inode)) + i_size_write(inode, pos); return 0; } From 6c0912739699d8e4b6a87086401bf3ad3c59502d Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Thu, 22 Apr 2021 18:08:34 +0000 Subject: [PATCH 32/32] ext4: wipe ext4_dir_entry2 upon file deletion Upon file deletion, zero out all fields in ext4_dir_entry2 besides rec_len. In case sensitive data is stored in filenames, this ensures no potentially sensitive data is left in the directory entry upon deletion. Also, wipe these fields upon moving a directory entry during the conversion to an htree and when splitting htree nodes. The data wiped may still exist in the journal, but there are future commits planned to address this. Signed-off-by: Leah Rumancik Link: https://lore.kernel.org/r/20210422180834.2242353-1-leah.rumancik@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 680fc3211cbf..8b46a17a85c1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1854,7 +1854,14 @@ dx_move_dirents(struct inode *dir, char *from, char *to, memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + + /* wipe dir_entry excluding the rec_len field */ de->inode = 0; + memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + map++; to += rec_len; } @@ -2188,6 +2195,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, data2 = bh2->b_data; memcpy(data2, de, len); + memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) @@ -2577,15 +2585,27 @@ int ext4_generic_delete_entry(struct inode *dir, entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { - if (pde) + if (pde) { pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, blocksize) + ext4_rec_len_from_disk(de->rec_len, blocksize), blocksize); - else + + /* wipe entire dir_entry */ + memset(de, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize)); + } else { + /* wipe dir_entry excluding the rec_len field */ de->inode = 0; + memset(&de->name_len, 0, + ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + } + inode_inc_iversion(dir); return 0; }