Btrfs: skip writeback of last page when truncating file to same size

When we truncate a file to the same size and that size is not aligned
with the sector size, we end up triggering writeback (and wait for it to
complete) of the last page. This is unncessary as we can not have delayed
allocation beyond the inode's i_size and the goal of truncating a file
to its own size is to discard prealloc extents (allocated via the
fallocate(2) system call). Besides the unnecessary IO start and wait, it
also breaks the oppurtunity for larger contiguous extents on disk, as
before the last dirty page there might be other dirty pages.

This scenario is probably not very common in general, however it is
common for btrfs receive implementations because currently the send
stream always issues a truncate operation for each processed inode as
the last operation for that inode (this truncate operation is not
always needed and the send implementation will be addressed to avoid
them).

So improve this by not starting and waiting for writeback of the inode's
last page when we are truncating to exactly the same size.

The following script was used to quickly measure the time a receive
operation takes:

 $ cat test_send.sh
 #!/bin/bash

 SRC_DEV=/dev/sdc
 DST_DEV=/dev/sdd
 SRC_MNT=/mnt/sdc
 DST_MNT=/mnt/sdd

 mkfs.btrfs -f $SRC_DEV >/dev/null
 mkfs.btrfs -f $DST_DEV >/dev/null
 mount $SRC_DEV $SRC_MNT
 mount $DST_DEV $DST_MNT

 echo "Creating source filesystem"
 for ((t = 0; t < 10; t++)); do
     (
         for ((i = 1; i <= 20000; i++)); do
             xfs_io -f -c "pwrite -S 0xab 0 5000" \
                $SRC_MNT/file_$i > /dev/null
         done
     ) &
     worker_pids[$t]=$!
 done
 wait ${worker_pids[@]}

 echo "Creating and sending snapshot"
 btrfs subvolume snapshot -r $SRC_MNT $SRC_MNT/snap1 >/dev/null
 /usr/bin/time -f "send took %e seconds"    \
     btrfs send -f $SRC_MNT/send_file $SRC_MNT/snap1
 /usr/bin/time -f "receive took %e seconds" \
     btrfs receive -f $SRC_MNT/send_file $DST_MNT

 umount $SRC_MNT
 umount $DST_MNT

The results for 5 runs were the following:

* Without this change

average receive time was 26.49 seconds
standard deviation of 2.53 seconds

* With this change

average receive time was 12.51 seconds
standard deviation of 0.32 seconds

Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Filipe Manana 2018-02-06 20:40:31 +00:00 коммит произвёл David Sterba
Родитель ed5d5f37e6
Коммит 213e8c5520
1 изменённых файлов: 10 добавлений и 8 удалений

Просмотреть файл

@ -101,7 +101,7 @@ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
};
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
@ -3668,7 +3668,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
goto out;
}
ret = btrfs_truncate(inode);
ret = btrfs_truncate(inode, false);
if (ret)
btrfs_orphan_del(NULL, BTRFS_I(inode));
} else {
@ -5154,7 +5154,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
inode_dio_wait(inode);
btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode);
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
int err;
@ -9136,7 +9136,7 @@ out_noreserve:
return ret;
}
static int btrfs_truncate(struct inode *inode)
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@ -9147,10 +9147,12 @@ static int btrfs_truncate(struct inode *inode)
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
(u64)-1);
if (ret)
return ret;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
(u64)-1);
if (ret)
return ret;
}
/*
* Yes ladies and gentlemen, this is indeed ugly. The fact is we have