From ccf0f32acd436b9e554303fd571f1bbf5f49d8e2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 18 Feb 2018 20:53:23 -0500 Subject: [PATCH 01/29] ext4: add tracepoints for shutdown and file system errors Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 1 + fs/ext4/super.c | 4 ++++ include/trace/events/ext4.h | 43 +++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7e99ad02f1ba..4d1b1575f8ac 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -481,6 +481,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); + trace_ext4_shutdown(sb, flags); switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 39bf464c35f1..756f515b762d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -448,6 +448,7 @@ void __ext4_error(struct super_block *sb, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; + trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; @@ -472,6 +473,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; + trace_ext4_error(inode->i_sb, function, line); es->s_last_error_ino = cpu_to_le32(inode->i_ino); es->s_last_error_block = cpu_to_le64(block); if (ext4_error_ratelimit(inode->i_sb)) { @@ -507,6 +509,7 @@ void __ext4_error_file(struct file *file, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; + trace_ext4_error(inode->i_sb, function, line); es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { @@ -719,6 +722,7 @@ __acquires(bitlock) if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; + trace_ext4_error(sb, function, line); es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); __save_error_info(sb, function, line); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 4d0e3af4e561..0e31eb136c57 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2585,6 +2585,49 @@ DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); +TRACE_EVENT(ext4_shutdown, + TP_PROTO(struct super_block *sb, unsigned long flags), + + TP_ARGS(sb, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned, flags ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->flags = flags; + ), + + TP_printk("dev %d,%d flags %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags) +); + +TRACE_EVENT(ext4_error, + TP_PROTO(struct super_block *sb, const char *function, + unsigned int line), + + TP_ARGS(sb, function, line), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( const char *, function ) + __field( unsigned, line ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->function = function; + __entry->line = line; + ), + + TP_printk("dev %d,%d function %s line %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->function, __entry->line) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ From ed65b00f8de1d0687565a1ad6511901a721adb66 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 18 Feb 2018 21:33:13 -0500 Subject: [PATCH 02/29] jbd2: clarify bad journal block checksum message There were two error messages emitted by jbd2, one for a bad checksum for a jbd2 descriptor block, and one for a bad checksum for a jbd2 data block. Change the data block checksum error so that the two can be disambiguated. Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index f99910b69c78..a4967b27ffb6 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -600,8 +600,8 @@ static int do_one_pass(journal_t *journal, success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " "checksum recovering " - "block %llu in log\n", - blocknr); + "data block %llu in " + "log\n", blocknr); block_error = 1; goto skip_write; } From 576d18ed60f5465110087c5e0eb1010de13e374d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 18 Feb 2018 22:07:36 -0500 Subject: [PATCH 03/29] ext4: shutdown should not prevent get_write_access The ext4 forced shutdown flag needs to prevent new handles from being started, but it needs to allow existing handles to complete. So the forced shutdown flag should not force ext4_journal_get_write_access to fail. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ext4_jbd2.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 2d593201cf7a..7c70b08d104c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -166,13 +166,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); if (ext4_handle_valid(handle)) { - struct super_block *sb; - - sb = handle->h_transaction->t_journal->j_private; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) { - jbd2_journal_abort_handle(handle); - return -EIO; - } err = jbd2_journal_get_write_access(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, From a6d9946bb925293fda9f5ed6d33d8580b001f006 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 18 Feb 2018 23:16:28 -0500 Subject: [PATCH 04/29] ext4: eliminate sleep from shutdown ioctl The msleep() when processing EXT4_GOING_FLAGS_NOLOGFLUSH was a hack to avoid some races (that are now fixed), but in fact it introduced its own race. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4d1b1575f8ac..16d3d1325f5b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -498,10 +498,8 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { - msleep(100); + if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) jbd2_journal_abort(sbi->s_journal, 0); - } break; default: return -EINVAL; From fb7c02445c497943e7296cd3deee04422b63acb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 18 Feb 2018 23:45:18 -0500 Subject: [PATCH 05/29] ext4: pass -ESHUTDOWN code to jbd2 layer Previously the jbd2 layer assumed that a file system check would be required after a journal abort. In the case of the deliberate file system shutdown, this should not be necessary. Allow the jbd2 layer to distinguish between these two cases by using the ESHUTDOWN errno. Also add proper locking to __journal_abort_soft(). Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ioctl.c | 4 ++-- fs/jbd2/journal.c | 25 +++++++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 16d3d1325f5b..9ac33a7cbd32 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -493,13 +493,13 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { (void) ext4_force_commit(sb); - jbd2_journal_abort(sbi->s_journal, 0); + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); } break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) - jbd2_journal_abort(sbi->s_journal, 0); + jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); break; default: return -EINVAL; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 3fbf48ec2188..efa0c72a0b9f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1483,12 +1483,15 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) void jbd2_journal_update_sb_errno(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; + int errcode; read_lock(&journal->j_state_lock); - jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", - journal->j_errno); - sb->s_errno = cpu_to_be32(journal->j_errno); + errcode = journal->j_errno; read_unlock(&journal->j_state_lock); + if (errcode == -ESHUTDOWN) + errcode = 0; + jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); + sb->s_errno = cpu_to_be32(errcode); jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); } @@ -2105,12 +2108,22 @@ void __jbd2_journal_abort_hard(journal_t *journal) * but don't do any other IO. */ static void __journal_abort_soft (journal_t *journal, int errno) { - if (journal->j_flags & JBD2_ABORT) - return; + int old_errno; - if (!journal->j_errno) + write_lock(&journal->j_state_lock); + old_errno = journal->j_errno; + if (!journal->j_errno || errno == -ESHUTDOWN) journal->j_errno = errno; + if (journal->j_flags & JBD2_ABORT) { + write_unlock(&journal->j_state_lock); + if (!old_errno && old_errno != -ESHUTDOWN && + errno == -ESHUTDOWN) + jbd2_journal_update_sb_errno(journal); + return; + } + write_unlock(&journal->j_state_lock); + __jbd2_journal_abort_hard(journal); if (errno) { From 85e0c4e89c1b864e763c4e3bb15d0b6d501ad5d9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 19 Feb 2018 12:22:53 -0500 Subject: [PATCH 06/29] jbd2: if the journal is aborted then don't allow update of the log tail This updates the jbd2 superblock unnecessarily, and on an abort we shouldn't truncate the log. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/jbd2/journal.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index efa0c72a0b9f..dfb057900e79 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -974,7 +974,7 @@ out: } /* - * This is a variaon of __jbd2_update_log_tail which checks for validity of + * This is a variation of __jbd2_update_log_tail which checks for validity of * provided log tail and locks j_checkpoint_mutex. So it is safe against races * with other threads updating log tail. */ @@ -1417,6 +1417,9 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, journal_superblock_t *sb = journal->j_superblock; int ret; + if (is_journal_aborted(journal)) + return -EIO; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); From 044e6e3d74a3d7103a0c8a9305dfd94d64000660 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 19 Feb 2018 14:16:47 -0500 Subject: [PATCH 07/29] ext4: don't update checksum of new initialized bitmaps When reading the inode or block allocation bitmap, if the bitmap needs to be initialized, do not update the checksum in the block group descriptor. That's because we're not set up to journal those changes. Instead, just set the verified bit on the bitmap block, so that it's not necessary to validate the checksum. When a block or inode allocation actually happens, at that point the checksum will be calculated, and update of the bg descriptor block will be properly journalled. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/balloc.c | 3 +-- fs/ext4/ialloc.c | 47 +++-------------------------------------------- 2 files changed, 4 insertions(+), 46 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f9b3e0a83526..f82c4966f4ce 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -243,8 +243,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); - ext4_group_desc_csum_set(sb, block_group, gdp); return 0; } @@ -448,6 +446,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); if (err) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7830d28df331..3fa93665b4a3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -66,44 +66,6 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); } -/* Initializes an uninitialized inode bitmap */ -static int ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) -{ - struct ext4_group_info *grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks and inodes use to prevent - * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return -EFSBADCRC; - } - - memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); - ext4_group_desc_csum_set(sb, block_group, gdp); - - return 0; -} - void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) { if (uptodate) { @@ -187,17 +149,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - err = ext4_init_inode_bitmap(sb, bh, block_group, desc); + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - if (err) { - ext4_error(sb, "Failed to init inode bitmap for group " - "%u: %d", block_group, err); - goto out; - } return bh; } ext4_unlock_group(sb, block_group); From 73fdad00b208b139cf43f3163fbc0f67e4c6047c Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 22 Mar 2018 11:41:25 -0400 Subject: [PATCH 08/29] ext4: protect i_disksize update by i_data_sem in direct write path i_disksize update should be protected by i_data_sem, by either taking the lock explicitly or by using ext4_update_i_disksize() helper. But the i_disksize updates in ext4_direct_IO_write() are not protected at all, which may be racing with i_disksize updates in writeback path in delalloc buffer write path. This is found by code inspection, and I didn't hit any i_disksize corruption due to this bug. Thanks to Jan Kara for catching this bug and suggesting the fix! Reported-by: Jan Kara Suggested-by: Jan Kara Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c94780075b04..bff44b4a0783 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3658,7 +3658,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); ssize_t ret; loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(iter); @@ -3682,7 +3681,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) goto out; } orphan = 1; - ei->i_disksize = inode->i_size; + ext4_update_i_disksize(inode, inode->i_size); ext4_journal_stop(handle); } @@ -3790,7 +3789,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) if (ret > 0) { loff_t end = offset + ret; if (end > inode->i_size) { - ei->i_disksize = end; + ext4_update_i_disksize(inode, end); i_size_write(inode, end); /* * We're going to return a positive `ret' From 45d8ec4d9fd5468c08f2ef0b2b132bb62dc81a3d Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 22 Mar 2018 11:44:59 -0400 Subject: [PATCH 09/29] ext4: update i_disksize if direct write past ondisk size Currently in ext4 direct write path, we update i_disksize only when new eof is greater than i_size, and don't update it even when new eof is greater than i_disksize but less than i_size. This doesn't work well with delalloc buffer write, which updates i_size and i_disksize only when delalloc blocks are resolved (at writeback time), the i_disksize from direct write can be lost if a previous buffer write succeeded at write time but failed at writeback time, then results in corrupted ondisk inode size. Consider this case, first buffer write 4k data to a new file at offset 16k with delayed allocation, then direct write 4k data to the same file at offset 4k before delalloc blocks are resolved, which doesn't update i_disksize because it writes within i_size(20k), but the extent tree metadata has been committed in journal. Then writeback of the delalloc blocks fails (due to device error etc.), and i_size/i_disksize from buffer write can't be written to disk (still zero). A subsequent umount/mount cycle recovers journal and writes extent tree metadata from direct write to disk, but with i_disksize being zero. Fix it by updating i_disksize too in direct write path when new eof is greater than i_disksize but less than i_size, so i_disksize is always consistent with direct write. This fixes occasional i_size corruption in fstests generic/475. Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bff44b4a0783..9acac476c15c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3658,6 +3658,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); ssize_t ret; loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(iter); @@ -3668,7 +3669,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) int orphan = 0; handle_t *handle; - if (final_size > inode->i_size) { + if (final_size > inode->i_size || final_size > ei->i_disksize) { /* Credits for sb + inode write */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { @@ -3788,9 +3789,10 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) ext4_orphan_del(handle, inode); if (ret > 0) { loff_t end = offset + ret; - if (end > inode->i_size) { + if (end > inode->i_size || end > ei->i_disksize) { ext4_update_i_disksize(inode, end); - i_size_write(inode, end); + if (end > inode->i_size) + i_size_write(inode, end); /* * We're going to return a positive `ret' * here due to non-zero-length I/O, so there's From fe23cb65c2c394ea306f3714a17d46ab2e6a0af1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 22 Mar 2018 11:50:26 -0400 Subject: [PATCH 10/29] ext4: fix offset overflow on 32-bit archs in ext4_iomap_begin() ext4_iomap_begin() has a bug where offset returned in the iomap structure will be truncated to unsigned long size. On 64-bit architectures this is fine but on 32-bit architectures obviously not. Not many places actually use the offset stored in the iomap structure but one of visible failures is in SEEK_HOLE / SEEK_DATA implementation. If we create a file like: dd if=/dev/urandom of=file bs=1k seek=8m count=1 then lseek64("file", 0x100000000ULL, SEEK_DATA) wrongly returns 0x100000000 on unfixed kernel while it should return 0x200000000. Avoid the overflow by proper type cast. Fixes: 545052e9e35a ("ext4: Switch to iomap for SEEK_HOLE / SEEK_DATA") Signed-off-by: Jiri Slaby Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org # v4.15 --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9acac476c15c..d8a692f04156 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3524,7 +3524,7 @@ retry: iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; - iomap->offset = first_block << blkbits; + iomap->offset = (u64)first_block << blkbits; iomap->length = (u64)map.m_len << blkbits; if (ret == 0) { From 1d39834fba99c48edd3d4887ccd474da61a1ada7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 22 Mar 2018 11:52:10 -0400 Subject: [PATCH 11/29] ext4: remove EXT4_STATE_DIOREAD_LOCK flag Commit 16c54688592c ("ext4: Allow parallel DIO reads") reworked the way locking happens around parallel dio reads. This resulted in obviating the need for EXT4_STATE_DIOREAD_LOCK flag and accompanying logic. Currently this amounts to dead code so let's remove it. No functional changes Signed-off-by: Nikolay Borisov Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 17 ----------------- fs/ext4/extents.c | 19 +++++-------------- fs/ext4/inode.c | 8 -------- fs/ext4/ioctl.c | 4 ---- fs/ext4/move_extent.c | 4 ---- fs/ext4/super.c | 12 +++++------- 6 files changed, 10 insertions(+), 54 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3241475a1733..a42e71203e53 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1522,8 +1522,6 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ - EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read - nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ @@ -3181,21 +3179,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -/* - * Disable DIO read nolock optimization, so new dioreaders will be forced - * to grab i_mutex - */ -static inline void ext4_inode_block_unlocked_dio(struct inode *inode) -{ - ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); - smp_mb(); -} -static inline void ext4_inode_resume_unlocked_dio(struct inode *inode) -{ - smp_mb(); - ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK); -} - #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) /* For ioend & aio unwritten conversion wait queues */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 054416e9d827..837f0a6357c7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4796,7 +4796,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= EXT4_GET_BLOCKS_KEEP_SIZE; /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* Preallocate the range including the unaligned edges */ @@ -4807,7 +4806,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags); if (ret) - goto out_dio; + goto out_mutex; } @@ -4824,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { up_write(&EXT4_I(inode)->i_mmap_sem); - goto out_dio; + goto out_mutex; } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); @@ -4834,10 +4833,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags); up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) - goto out_dio; + goto out_mutex; } if (!partial_begin && !partial_end) - goto out_dio; + goto out_mutex; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4850,7 +4849,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_dio; + goto out_mutex; } inode->i_mtime = inode->i_ctime = current_time(inode); @@ -4875,8 +4874,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, ext4_handle_sync(handle); ext4_journal_stop(handle); -out_dio: - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; @@ -4964,11 +4961,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5485,7 +5480,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } /* Wait for existing dio to complete */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -5562,7 +5556,6 @@ out_stop: ext4_journal_stop(handle); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; @@ -5635,7 +5628,6 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) } /* Wait for existing dio to complete */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -5737,7 +5729,6 @@ out_stop: ext4_journal_stop(handle); out_mmap: up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8a692f04156..951a3d69ed17 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4252,7 +4252,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -4325,7 +4324,6 @@ out_stop: ext4_journal_stop(handle); out_dio: up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); out_mutex: inode_unlock(inode); return ret; @@ -5507,9 +5505,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) */ if (orphan) { if (!ext4_should_journal_data(inode)) { - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); } else ext4_wait_for_tail_page_commit(inode); } @@ -6000,7 +5996,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return -EROFS; /* Wait for all existing dio workers */ - ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); /* @@ -6016,7 +6011,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); return err; } } @@ -6039,7 +6033,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (err < 0) { jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); - ext4_inode_resume_unlocked_dio(inode); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ -6051,7 +6044,6 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) up_write(&EXT4_I(inode)->i_mmap_sem); - ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 9ac33a7cbd32..a7074115d6f6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -124,8 +124,6 @@ static long swap_inode_boot_loader(struct super_block *sb, truncate_inode_pages(&inode_bl->i_data, 0); /* Wait for all existing dio workers */ - ext4_inode_block_unlocked_dio(inode); - ext4_inode_block_unlocked_dio(inode_bl); inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -186,8 +184,6 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_double_up_write_data_sem(inode, inode_bl); journal_err_out: - ext4_inode_resume_unlocked_dio(inode); - ext4_inode_resume_unlocked_dio(inode_bl); unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b96e4bd3b3ec..8e17efdcbf11 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -601,8 +601,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, lock_two_nondirectories(orig_inode, donor_inode); /* Wait for all existing dio workers */ - ext4_inode_block_unlocked_dio(orig_inode); - ext4_inode_block_unlocked_dio(donor_inode); inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); @@ -693,8 +691,6 @@ out: ext4_ext_drop_refs(path); kfree(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); - ext4_inode_resume_unlocked_dio(orig_inode); - ext4_inode_resume_unlocked_dio(donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 756f515b762d..3a13b7b24616 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -101,15 +101,13 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * i_data_sem (rw) * * truncate: - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> - * i_mmap_rwsem (w) -> page lock - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> - * transaction start -> i_data_sem (rw) + * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start -> + * i_data_sem (rw) * * direct IO: - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem - * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> - * transaction start -> i_data_sem (rw) + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) From 0d9366d67bcf066b028e57d09c9a86ce879bcc28 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 22 Mar 2018 11:59:00 -0400 Subject: [PATCH 12/29] ext4: don't complain about incorrect features when probing If mount is auto-probing for filesystem type, it will try various filesystems in order, with the MS_SILENT flag set. We get that flag as the silent arg to ext4_fill_super. If we're probing (silent==1) then don't complain about feature incompatibilities that are found if it looks like it's actually a different valid extN type - failed probes should be silent in this case. If the on-disk features are unknown even to ext4, then complain. Reported-by: Joakim Tjernlund Tested-by: Joakim Tjernlund Signed-off-by: Eric Sandeen Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3a13b7b24616..9d1da40c1f62 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3662,6 +3662,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { + /* + * If we're probing be silent, if this looks like + * it's actually an ext[34] filesystem. + */ + if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) + goto failed_mount; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); goto failed_mount; @@ -3673,6 +3679,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { + /* + * If we're probing be silent, if this looks like + * it's actually an ext4 filesystem. + */ + if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) + goto failed_mount; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); goto failed_mount; From 043d20d1592a194b96fb19282396e46cda633180 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 26 Mar 2018 01:32:50 -0400 Subject: [PATCH 13/29] ext4: use generic_writepages instead of __writepage/write_cache_pages Code cleanup. Instead of writing an internal static function, use the available generic_writepages(). Signed-off-by: Goldwyn Rodrigues Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 951a3d69ed17..435965598cb8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2694,15 +2694,6 @@ out: return err; } -static int __writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = ext4_writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; -} - static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -2740,11 +2731,7 @@ static int ext4_writepages(struct address_space *mapping, goto out_writepages; if (ext4_should_journal_data(inode)) { - struct blk_plug plug; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __writepage, mapping); - blk_finish_plug(&plug); + ret = generic_writepages(mapping, wbc); goto out_writepages; } From dcae058a8da9c3cfc0055c7937ccd1a3dd0382a8 Mon Sep 17 00:00:00 2001 From: "zhenwei.pi" Date: Mon, 26 Mar 2018 01:44:03 -0400 Subject: [PATCH 14/29] ext4: fix comments in ext4_swap_extents() "mark_unwritten" in comment and "unwritten" in the function arguments is mismatched. Signed-off-by: zhenwei.pi Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 837f0a6357c7..0a7315961bac 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5742,7 +5742,7 @@ out_mutex: * @lblk1: Start block for first inode * @lblk2: Start block for second inode * @count: Number of blocks to swap - * @mark_unwritten: Mark second inode's extents as unwritten after swap + * @unwritten: Mark second inode's extents as unwritten after swap * @erp: Pointer to save error value * * This helper routine does exactly what is promise "swap extents". All other @@ -5756,7 +5756,7 @@ out_mutex: */ int ext4_swap_extents(handle_t *handle, struct inode *inode1, - struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int unwritten, int *erp) { struct ext4_ext_path *path1 = NULL; From 7dac4a1726a9c64a517d595c40e95e2d0d135f6f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 26 Mar 2018 23:54:10 -0400 Subject: [PATCH 15/29] ext4: add validity checks for bitmap block numbers An privileged attacker can cause a crash by mounting a crafted ext4 image which triggers a out-of-bounds read in the function ext4_valid_block_bitmap() in fs/ext4/balloc.c. This issue has been assigned CVE-2018-1093. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=199181 BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1560782 Reported-by: Wen Xu Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/balloc.c | 16 ++++++++++++++-- fs/ext4/ialloc.c | 7 +++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f82c4966f4ce..a33d8fb1bf2a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -338,20 +338,25 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; - if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) + if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; + if (offset < 0 || EXT4_B2C(sbi, offset) >= sb->s_blocksize || + EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= sb->s_blocksize) + return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group), EXT4_B2C(sbi, offset)); @@ -417,6 +422,7 @@ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; @@ -425,6 +431,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid block bitmap block %llu in " + "block_group %u", bitmap_blk, block_group); + return ERR_PTR(-EFSCORRUPTED); + } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_error(sb, "Cannot get buffer for block bitmap - " diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3fa93665b4a3..df92e3ec9913 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -122,6 +122,7 @@ static struct buffer_head * ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; int err; @@ -131,6 +132,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); + if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { + ext4_error(sb, "Invalid inode bitmap blk %llu in " + "block_group %u", bitmap_blk, block_group); + return ERR_PTR(-EFSCORRUPTED); + } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_error(sb, "Cannot read inode bitmap - " From ce3fd194fcc6fbdc00ce095a852f22df97baa401 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 29 Mar 2018 14:31:42 -0400 Subject: [PATCH 16/29] ext4: limit xattr size to INT_MAX ext4 isn't validating the sizes of xattrs where the value of the xattr is stored in an external inode. This is problematic because ->e_value_size is a u32, but ext4_xattr_get() returns an int. A very large size is misinterpreted as an error code, which ext4_get_acl() translates into a bogus ERR_PTR() for which IS_ERR() returns false, causing a crash. Fix this by validating that all xattrs are <= INT_MAX bytes. This issue has been assigned CVE-2018-1095. https://bugzilla.kernel.org/show_bug.cgi?id=199185 https://bugzilla.redhat.com/show_bug.cgi?id=1560793 Reported-by: Wen Xu Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Fixes: e50e5129f384 ("ext4: xattr-in-inode support") --- fs/ext4/xattr.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 63656dbafdc4..2077d87b09f2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -195,10 +195,13 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, /* Check the values */ while (!IS_LAST_ENTRY(entry)) { - if (entry->e_value_size != 0 && - entry->e_value_inum == 0) { + u32 size = le32_to_cpu(entry->e_value_size); + + if (size > INT_MAX) + return -EFSCORRUPTED; + + if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); - u32 size = le32_to_cpu(entry->e_value_size); void *value; /* From 8e4b5eae5decd9dfe5a4ee369c22028f90ab4c44 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Mar 2018 21:56:09 -0400 Subject: [PATCH 17/29] ext4: fail ext4_iget for root directory if unallocated If the root directory has an i_links_count of zero, then when the file system is mounted, then when ext4_fill_super() notices the problem and tries to call iput() the root directory in the error return path, ext4_evict_inode() will try to free the inode on disk, before all of the file system structures are set up, and this will result in an OOPS caused by a NULL pointer dereference. This issue has been assigned CVE-2018-1092. https://bugzilla.kernel.org/show_bug.cgi?id=199179 https://bugzilla.redhat.com/show_bug.cgi?id=1560777 Reported-by: Wen Xu Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 435965598cb8..18aa2ef963ad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4732,6 +4732,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); + if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { + EXT4_ERROR_INODE(inode, "root inode unallocated"); + ret = -EFSCORRUPTED; + goto bad_inode; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > From a45403b51582a87872927a3e0fc0a389c26867f1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Mar 2018 22:10:31 -0400 Subject: [PATCH 18/29] ext4: always initialize the crc32c checksum driver The extended attribute code now uses the crc32c checksum for hashing purposes, so we should just always always initialize it. We also want to prevent NULL pointer dereferences if one of the metadata checksum features is enabled after the file sytsem is originally mounted. This issue has been assigned CVE-2018-1094. https://bugzilla.kernel.org/show_bug.cgi?id=199183 https://bugzilla.redhat.com/show_bug.cgi?id=1560788 Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9d1da40c1f62..7cd022c344d1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3492,15 +3492,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (ext4_has_feature_metadata_csum(sb) || - ext4_has_feature_ea_inode(sb)) { - sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(sbi->s_chksum_driver)) { - ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); - ret = PTR_ERR(sbi->s_chksum_driver); - sbi->s_chksum_driver = NULL; - goto failed_mount; - } + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + ret = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto failed_mount; } /* Check superblock checksum */ From 18db4b4e6fc31eda838dd1c1296d67dbcb3dc957 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 29 Mar 2018 22:10:35 -0400 Subject: [PATCH 19/29] ext4: don't allow r/w mounts if metadata blocks overlap the superblock If some metadata block, such as an allocation bitmap, overlaps the superblock, it's very likely that if the file system is mounted read/write, the results will not be pretty. So disallow r/w mounts for file systems corrupted in this particular way. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7cd022c344d1..edcfe6956eba 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2335,6 +2335,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); + if (!sb_rdonly(sb)) + return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2347,6 +2349,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); + if (!sb_rdonly(sb)) + return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " @@ -2359,6 +2363,8 @@ static int ext4_check_descriptors(struct super_block *sb, ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); + if (!sb_rdonly(sb)) + return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { From c2e5df762601157f9c101025f10f873ae792e6cb Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Fri, 30 Mar 2018 00:03:38 -0400 Subject: [PATCH 20/29] ext4: null out kobject* during sysfs cleanup Make cleanup of ext4_feat kobject consistent with similar objects. Signed-off-by: Tyson Nottingham Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 1205261f130c..aa8165e7c44b 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -464,6 +464,7 @@ int __init ext4_init_sysfs(void) feat_err: kobject_put(ext4_feat); + ext4_feat = NULL; kset_err: kset_unregister(ext4_kset); ext4_kset = NULL; @@ -473,6 +474,7 @@ kset_err: void ext4_exit_sysfs(void) { kobject_put(ext4_feat); + ext4_feat = NULL; kset_unregister(ext4_kset); ext4_kset = NULL; remove_proc_entry(proc_dirname, NULL); From 6ca06829fba6019913f524772ec189f1dc7ec3c8 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Fri, 30 Mar 2018 00:13:10 -0400 Subject: [PATCH 21/29] ext4: remove unused parameters in sysfs code Signed-off-by: Tyson Nottingham Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index aa8165e7c44b..3b6b3ab57bf6 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -49,8 +49,7 @@ struct ext4_attr { } u; }; -static ssize_t session_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) +static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; @@ -61,8 +60,7 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a, sbi->s_sectors_written_start) >> 1); } -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) +static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; @@ -74,8 +72,7 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } -static ssize_t inode_readahead_blks_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, +static ssize_t inode_readahead_blks_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long t; @@ -92,8 +89,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, return count; } -static ssize_t reserved_clusters_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, +static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, const char *buf, size_t count) { unsigned long long val; @@ -109,8 +105,7 @@ static ssize_t reserved_clusters_store(struct ext4_attr *a, return count; } -static ssize_t trigger_test_error(struct ext4_attr *a, - struct ext4_sb_info *sbi, +static ssize_t trigger_test_error(struct ext4_sb_info *sbi, const char *buf, size_t count) { int len = count; @@ -268,9 +263,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj, (s64) EXT4_C2B(sbi, percpu_counter_sum(&sbi->s_dirtyclusters_counter))); case attr_session_write_kbytes: - return session_write_kbytes_show(a, sbi, buf); + return session_write_kbytes_show(sbi, buf); case attr_lifetime_write_kbytes: - return lifetime_write_kbytes_show(a, sbi, buf); + return lifetime_write_kbytes_show(sbi, buf); case attr_reserved_clusters: return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long) @@ -306,7 +301,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj, switch (a->attr_id) { case attr_reserved_clusters: - return reserved_clusters_store(a, sbi, buf, len); + return reserved_clusters_store(sbi, buf, len); case attr_pointer_ui: if (!ptr) return 0; @@ -316,9 +311,9 @@ static ssize_t ext4_attr_store(struct kobject *kobj, *((unsigned int *) ptr) = t; return len; case attr_inode_readahead: - return inode_readahead_blks_store(a, sbi, buf, len); + return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: - return trigger_test_error(a, sbi, buf, len); + return trigger_test_error(sbi, buf, len); } return 0; } From bc1420ae56266fa2c5a8e452d55f744ca98fe42f Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Fri, 30 Mar 2018 00:41:34 -0400 Subject: [PATCH 22/29] ext4: simplify kobject usage Replace kset with generic kobject provided by kobject_create_and_add(), since the latter is sufficient. Signed-off-by: Tyson Nottingham Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 45 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 3b6b3ab57bf6..9ebd26c957c2 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -325,13 +325,6 @@ static void ext4_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -static void ext4_kset_release(struct kobject *kobj) -{ - struct kset *kset = container_of(kobj, struct kset, kobj); - - kfree(kset); -} - static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, @@ -343,19 +336,14 @@ static struct kobj_type ext4_sb_ktype = { .release = ext4_sb_release, }; -static struct kobj_type ext4_ktype = { - .sysfs_ops = &ext4_attr_ops, - .release = ext4_kset_release, -}; - -static struct kset *ext4_kset; - static struct kobj_type ext4_feat_ktype = { .default_attrs = ext4_feat_attrs, .sysfs_ops = &ext4_attr_ops, .release = (void (*)(struct kobject *))kfree, }; +static struct kobject *ext4_root; + static struct kobject *ext4_feat; #define PROC_FILE_SHOW_DEFN(name) \ @@ -393,9 +381,8 @@ int ext4_register_sysfs(struct super_block *sb) const struct ext4_proc_files *p; int err; - sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL, + err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, "%s", sb->s_id); if (err) { kobject_put(&sbi->s_kobj); @@ -431,26 +418,18 @@ int __init ext4_init_sysfs(void) { int ret; - ext4_kset = kzalloc(sizeof(*ext4_kset), GFP_KERNEL); - if (!ext4_kset) + ext4_root = kobject_create_and_add("ext4", fs_kobj); + if (!ext4_root) return -ENOMEM; - kobject_set_name(&ext4_kset->kobj, "ext4"); - ext4_kset->kobj.parent = fs_kobj; - ext4_kset->kobj.ktype = &ext4_ktype; - ret = kset_register(ext4_kset); - if (ret) - goto kset_err; - ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL); if (!ext4_feat) { ret = -ENOMEM; - goto kset_err; + goto root_err; } - ext4_feat->kset = ext4_kset; ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype, - NULL, "features"); + ext4_root, "features"); if (ret) goto feat_err; @@ -460,9 +439,9 @@ int __init ext4_init_sysfs(void) feat_err: kobject_put(ext4_feat); ext4_feat = NULL; -kset_err: - kset_unregister(ext4_kset); - ext4_kset = NULL; +root_err: + kobject_put(ext4_root); + ext4_root = NULL; return ret; } @@ -470,8 +449,8 @@ void ext4_exit_sysfs(void) { kobject_put(ext4_feat); ext4_feat = NULL; - kset_unregister(ext4_kset); - ext4_kset = NULL; + kobject_put(ext4_root); + ext4_root = NULL; remove_proc_entry(proc_dirname, NULL); ext4_proc_root = NULL; } From 68afa7e0836263e28ebf272e56a110a56ffdabb6 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Fri, 30 Mar 2018 00:51:10 -0400 Subject: [PATCH 23/29] ext4: show more binary mount options in procfs Previously, /proc/fs/ext4//options would only show binary options if they were set (1 in the options bit mask). E.g. it would show "grpid" if it was set, but it would not show "nogrpid" if grpid was not set. This seems sensible, but when an option is absent from the file, it can be hard for the unfamiliar to know what is being used. E.g. if there isn't a (no)grpid entry, nogrpid is in effect. But if there isn't a (no)auto_da_alloc entry, auto_da_alloc is in effect. If there isn't a (minixdf|bsddf) entry, it turns out bsddf is in effect. It all depends on how the option is implemented. It's clearer to be explicit, so print the corresponding option regardless of whether it means a 1 or a 0 in the bit mask. Note that options which do not have an explicit disable option aren't indicated as being disabled even with this change (e.g. dax). Signed-off-by: Tyson Nottingham Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index edcfe6956eba..2376ac09e013 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2021,7 +2021,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + int def_errors, def_mount_opt = sbi->s_def_mount_opt; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; @@ -2036,7 +2036,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || (m->flags & MOPT_CLEAR_ERR)) continue; - if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ if ((want_set && (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || @@ -2070,7 +2070,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); - if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (nodefs || EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) From ceec03764a3921d03aefe4cc9b3b14cf8527ceff Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Fri, 30 Mar 2018 00:53:33 -0400 Subject: [PATCH 24/29] ext4: omit init_itable=n in procfs when disabled Don't show init_itable=n in /proc/fs/ext4//options when filesystem is mounted with noinit_itable. Signed-off-by: Tyson Nottingham Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2376ac09e013..20d5233856c5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2084,7 +2084,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); - if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) From 27f394a7718d00ad16c59c616638cb46c6cd6a9d Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Fri, 30 Mar 2018 00:56:10 -0400 Subject: [PATCH 25/29] ext4: don't show data= option if defaulted Previously, mount -l would show data= even if the ext4 default journaling mode was being used. Change this to be consistent with the rest of the options. Ext4 already did the right thing when the journaling mode being used matched the one specified in the superblock's default mount options. The reason it failed to do the right thing for the ext4 defaults is that, when set, they were never included in sbi->s_def_mount_opt (unlike the superblock's defaults, which were). Signed-off-by: Tyson Nottingham Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20d5233856c5..185f7e61f4cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4112,10 +4112,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); - else + sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + } else { set_opt(sb, JOURNAL_DATA); + sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + } break; case EXT4_MOUNT_ORDERED_DATA: From de05ca8526796c7e9f7c7282b7f89a818af19818 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 30 Mar 2018 15:42:25 -0400 Subject: [PATCH 26/29] ext4: move call to ext4_error() into ext4_xattr_check_block() Refactor the call to EXT4_ERROR_INODE() into ext4_xattr_check_block(). This simplifies the code, and fixes a problem where not all callers of ext4_xattr_check_block() were not resulting in ext4_error() getting called when the xattr block is corrupted. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/xattr.c | 60 ++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2077d87b09f2..c030e41818ab 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -225,25 +225,36 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, } static inline int -ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) +__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, + const char *function, unsigned int line) { - int error; + int error = -EFSCORRUPTED; if (buffer_verified(bh)) return 0; if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) - return -EFSCORRUPTED; + goto errout; + error = -EFSBADCRC; if (!ext4_xattr_block_csum_verify(inode, bh)) - return -EFSBADCRC; + goto errout; error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); - if (!error) +errout: + if (error) + __ext4_error_inode(inode, function, line, 0, + "corrupted xattr block %llu", + (unsigned long long) bh->b_blocknr); + else set_buffer_verified(bh); return error; } +#define ext4_xattr_check_block(inode, bh) \ + __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) + + static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) @@ -514,12 +525,9 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(inode, bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; + error = ext4_xattr_check_block(inode, bh); + if (error) goto cleanup; - } ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, 1); @@ -679,12 +687,9 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(inode, bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; + error = ext4_xattr_check_block(inode, bh); + if (error) goto cleanup; - } ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); @@ -811,10 +816,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) goto out; } - if (ext4_xattr_check_block(inode, bh)) { - ret = -EFSCORRUPTED; + ret = ext4_xattr_check_block(inode, bh); + if (ret) goto out; - } for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) @@ -1796,12 +1800,9 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext4_xattr_check_block(inode, bs->bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; + error = ext4_xattr_check_block(inode, bs->bh); + if (error) goto cleanup; - } /* Find the named attribute. */ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); @@ -2724,13 +2725,9 @@ retry: error = -EIO; if (!bh) goto cleanup; - if (ext4_xattr_check_block(inode, bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EFSCORRUPTED; - brelse(bh); + error = ext4_xattr_check_block(inode, bh); + if (error) goto cleanup; - } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; @@ -2887,11 +2884,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, goto cleanup; } error = ext4_xattr_check_block(inode, bh); - if (error) { - EXT4_ERROR_INODE(inode, "bad block %llu (error %d)", - EXT4_I(inode)->i_file_acl, error); + if (error) goto cleanup; - } if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); From 9496005d6ca4cf8f5ee8f828165a8956872dc59d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 30 Mar 2018 20:00:56 -0400 Subject: [PATCH 27/29] ext4: add bounds checking to ext4_xattr_find_entry() Add some paranoia checks to make sure we don't stray beyond the end of the valid memory region containing ext4 xattr entries while we are scanning for a match. Also rename the function to xattr_find_entry() since it is static and thus only used in fs/ext4/xattr.c Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c030e41818ab..6304e81bfe6a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -276,18 +276,22 @@ errout: __xattr_check_inode((inode), (header), (end), __func__, __LINE__) static int -ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, - const char *name, int sorted) +xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, + void *end, int name_index, const char *name, int sorted) { - struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); - entry = *pentry; - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { + next = EXT4_XATTR_NEXT(entry); + if ((void *) next >= end) { + EXT4_ERROR_INODE(inode, "corrupted xattr entries"); + return -EFSCORRUPTED; + } cmp = name_index - entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; @@ -509,6 +513,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; + void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); @@ -530,7 +535,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, goto cleanup; ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, 1); + end = bh->b_data + bh->b_size; + error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -579,7 +585,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; entry = IFIRST(header); - error = ext4_xattr_find_entry(&entry, name_index, name, 0); + error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -1808,8 +1814,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; - error = ext4_xattr_find_entry(&bs->s.here, i->name_index, - i->name, 1); + error = xattr_find_entry(inode, &bs->s.here, bs->s.end, + i->name_index, i->name, 1); if (error && error != -ENODATA) goto cleanup; bs->s.not_found = error; @@ -2168,8 +2174,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, if (error) return error; /* Find the named attribute. */ - error = ext4_xattr_find_entry(&is->s.here, i->name_index, - i->name, 0); + error = xattr_find_entry(inode, &is->s.here, is->s.end, + i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; From 54dd0e0a1b255f115f8647fc6fb93273251b01b9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 30 Mar 2018 20:04:11 -0400 Subject: [PATCH 28/29] ext4: add extra checks to ext4_xattr_block_get() Add explicit checks in ext4_xattr_block_get() just in case the e_value_offs and e_value_size fields in the the xattr block are corrupted in memory after the buffer_verified bit is set on the xattr block. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/xattr.c | 26 +++++++++++++++++++------- fs/ext4/xattr.h | 11 +++++++++++ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6304e81bfe6a..499cb4b1fbd2 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -197,7 +197,7 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); - if (size > INT_MAX) + if (size > EXT4_XATTR_SIZE_MAX) return -EFSCORRUPTED; if (size != 0 && entry->e_value_inum == 0) { @@ -540,8 +540,10 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); + error = -ERANGE; + if (unlikely(size > EXT4_XATTR_SIZE_MAX)) + goto cleanup; if (buffer) { - error = -ERANGE; if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { @@ -550,8 +552,12 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; } else { - memcpy(buffer, bh->b_data + - le16_to_cpu(entry->e_value_offs), size); + u16 offset = le16_to_cpu(entry->e_value_offs); + void *p = bh->b_data + offset; + + if (unlikely(p + size > end)) + goto cleanup; + memcpy(buffer, p, size); } } error = size; @@ -589,8 +595,10 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); + error = -ERANGE; + if (unlikely(size > EXT4_XATTR_SIZE_MAX)) + goto cleanup; if (buffer) { - error = -ERANGE; if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { @@ -599,8 +607,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, if (error) goto cleanup; } else { - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); + u16 offset = le16_to_cpu(entry->e_value_offs); + void *p = (void *)IFIRST(header) + offset; + + if (unlikely(p + size > end)) + goto cleanup; + memcpy(buffer, p, size); } } error = size; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index dd54c4f995c8..f39cad2abe2a 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -70,6 +70,17 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) +/* + * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking + * for file system consistency errors, we use a somewhat bigger value. + * This allows XATTR_SIZE_MAX to grow in the future, but by using this + * instead of INT_MAX for certain consistency checks, we don't need to + * worry about arithmetic overflows. (Actually XATTR_SIZE_MAX is + * defined in include/uapi/linux/limits.h, so changing it is going + * not going to be trivial....) + */ +#define EXT4_XATTR_SIZE_MAX (1 << 24) + /* * The minimum size of EA value when you start storing it in an external inode * size of block - size of header - size of 1 entry - 4 null bytes From e40ff213898502d299351cc2fe1e350cd186f0d3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 1 Apr 2018 23:21:03 -0400 Subject: [PATCH 29/29] ext4: force revalidation of directory pointer after seekdir(2) A malicious user could force the directory pointer to be in an invalid spot by using seekdir(2). Use the mechanism we already have to notice if the directory has changed since the last time we called ext4_readdir() to force a revalidation of the pointer. Reported-by: syzbot+1236ce66f79263e8a862@syzkaller.appspotmail.com Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/dir.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index da87cf757f7d..e2902d394f1b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -365,13 +365,15 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); - loff_t htree_max = ext4_get_htree_eof(file); + loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) - return generic_file_llseek_size(file, offset, whence, + ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else - return ext4_llseek(file, offset, whence); + ret = ext4_llseek(file, offset, whence); + file->f_version = inode_peek_iversion(inode) - 1; + return ret; } /*