ext4: data=journal: write-protect pages on j_submit_inode_data_buffers()
This implements journal callbacks j_submit|finish_inode_data_buffers() with different behavior for data=journal: to write-protect pages under commit, preventing changes to buffers writeably mapped to userspace. If a buffer's content changes between commit's checksum calculation and write-out to disk, it can cause journal recovery/mount failures upon a kernel crash or power loss. [ 27.334874] EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support! [ 27.339492] JBD2: Invalid checksum recovering data block 8705 in log [ 27.342716] JBD2: recovery failed [ 27.343316] EXT4-fs (loop0): error loading journal mount: /ext4: can't read superblock on /dev/loop0. In j_submit_inode_data_buffers() we write-protect the inode's pages with write_cache_pages() and redirty w/ writepage callback if needed. In j_finish_inode_data_buffers() there is nothing do to. And in order to use the callbacks, inodes are added to the inode list in transaction in __ext4_journalled_writepage() and ext4_page_mkwrite(). In ext4_page_mkwrite() we must make sure that the buffers are attached to the transaction as jbddirty with write_end_fn(), as already done in __ext4_journalled_writepage(). Signed-off-by: Mauricio Faria de Oliveira <mfo@canonical.com> Reported-by: Dann Frazier <dann.frazier@canonical.com> Reported-by: kernel test robot <lkp@intel.com> # wbc.nr_to_write Suggested-by: Jan Kara <jack@suse.cz> Reviewed-by: Jan Kara <jack@suse.cz> Link: https://lore.kernel.org/r/20201006004841.600488-5-mfo@canonical.com Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
Родитель
64a9f14499
Коммит
afb585a97f
|
@ -1911,6 +1911,9 @@ static int __ext4_journalled_writepage(struct page *page,
|
|||
err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
|
||||
write_end_fn);
|
||||
}
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
err = ext4_jbd2_inode_add_write(handle, inode, 0, len);
|
||||
if (ret == 0)
|
||||
ret = err;
|
||||
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
|
||||
|
@ -6062,10 +6065,8 @@ retry_alloc:
|
|||
size = i_size_read(inode);
|
||||
/* Page got truncated from under us? */
|
||||
if (page->mapping != mapping || page_offset(page) > size) {
|
||||
unlock_page(page);
|
||||
ret = VM_FAULT_NOPAGE;
|
||||
ext4_journal_stop(handle);
|
||||
goto out;
|
||||
goto out_error;
|
||||
}
|
||||
|
||||
if (page->index == size >> PAGE_SHIFT)
|
||||
|
@ -6075,13 +6076,15 @@ retry_alloc:
|
|||
|
||||
err = __block_write_begin(page, 0, len, ext4_get_block);
|
||||
if (!err) {
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
if (ext4_walk_page_buffers(handle, page_buffers(page),
|
||||
0, len, NULL, do_journal_get_write_access)) {
|
||||
unlock_page(page);
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
ext4_journal_stop(handle);
|
||||
goto out;
|
||||
}
|
||||
0, len, NULL, do_journal_get_write_access))
|
||||
goto out_error;
|
||||
if (ext4_walk_page_buffers(handle, page_buffers(page),
|
||||
0, len, NULL, write_end_fn))
|
||||
goto out_error;
|
||||
if (ext4_jbd2_inode_add_write(handle, inode, 0, len))
|
||||
goto out_error;
|
||||
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
|
||||
} else {
|
||||
unlock_page(page);
|
||||
|
@ -6096,6 +6099,10 @@ out:
|
|||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
sb_end_pagefault(inode->i_sb);
|
||||
return ret;
|
||||
out_error:
|
||||
unlock_page(page);
|
||||
ext4_journal_stop(handle);
|
||||
goto out;
|
||||
}
|
||||
|
||||
vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
|
||||
|
|
|
@ -571,6 +571,89 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
|
|||
spin_unlock(&sbi->s_md_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* This writepage callback for write_cache_pages()
|
||||
* takes care of a few cases after page cleaning.
|
||||
*
|
||||
* write_cache_pages() already checks for dirty pages
|
||||
* and calls clear_page_dirty_for_io(), which we want,
|
||||
* to write protect the pages.
|
||||
*
|
||||
* However, we may have to redirty a page (see below.)
|
||||
*/
|
||||
static int ext4_journalled_writepage_callback(struct page *page,
|
||||
struct writeback_control *wbc,
|
||||
void *data)
|
||||
{
|
||||
transaction_t *transaction = (transaction_t *) data;
|
||||
struct buffer_head *bh, *head;
|
||||
struct journal_head *jh;
|
||||
|
||||
bh = head = page_buffers(page);
|
||||
do {
|
||||
/*
|
||||
* We have to redirty a page in these cases:
|
||||
* 1) If buffer is dirty, it means the page was dirty because it
|
||||
* contains a buffer that needs checkpointing. So the dirty bit
|
||||
* needs to be preserved so that checkpointing writes the buffer
|
||||
* properly.
|
||||
* 2) If buffer is not part of the committing transaction
|
||||
* (we may have just accidentally come across this buffer because
|
||||
* inode range tracking is not exact) or if the currently running
|
||||
* transaction already contains this buffer as well, dirty bit
|
||||
* needs to be preserved so that the buffer gets writeprotected
|
||||
* properly on running transaction's commit.
|
||||
*/
|
||||
jh = bh2jh(bh);
|
||||
if (buffer_dirty(bh) ||
|
||||
(jh && (jh->b_transaction != transaction ||
|
||||
jh->b_next_transaction))) {
|
||||
redirty_page_for_writepage(wbc, page);
|
||||
goto out;
|
||||
}
|
||||
} while ((bh = bh->b_this_page) != head);
|
||||
|
||||
out:
|
||||
return AOP_WRITEPAGE_ACTIVATE;
|
||||
}
|
||||
|
||||
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
|
||||
{
|
||||
struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
|
||||
struct writeback_control wbc = {
|
||||
.sync_mode = WB_SYNC_ALL,
|
||||
.nr_to_write = LONG_MAX,
|
||||
.range_start = jinode->i_dirty_start,
|
||||
.range_end = jinode->i_dirty_end,
|
||||
};
|
||||
|
||||
return write_cache_pages(mapping, &wbc,
|
||||
ext4_journalled_writepage_callback,
|
||||
jinode->i_transaction);
|
||||
}
|
||||
|
||||
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (ext4_should_journal_data(jinode->i_vfs_inode))
|
||||
ret = ext4_journalled_submit_inode_data_buffers(jinode);
|
||||
else
|
||||
ret = jbd2_journal_submit_inode_data_buffers(jinode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (!ext4_should_journal_data(jinode->i_vfs_inode))
|
||||
ret = jbd2_journal_finish_inode_data_buffers(jinode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool system_going_down(void)
|
||||
{
|
||||
return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
|
||||
|
@ -4753,9 +4836,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
|
|||
|
||||
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
|
||||
sbi->s_journal->j_submit_inode_data_buffers =
|
||||
jbd2_journal_submit_inode_data_buffers;
|
||||
ext4_journal_submit_inode_data_buffers;
|
||||
sbi->s_journal->j_finish_inode_data_buffers =
|
||||
jbd2_journal_finish_inode_data_buffers;
|
||||
ext4_journal_finish_inode_data_buffers;
|
||||
|
||||
no_journal:
|
||||
if (!test_opt(sb, NO_MBCACHE)) {
|
||||
|
|
Загрузка…
Ссылка в новой задаче