btrfs: replace pending/pinned chunks lists with io tree
The pending chunks list contains chunks that are allocated in the current transaction but haven't been created yet. The pinned chunks list contains chunks that are being released in the current transaction. Both describe chunks that are not reflected on disk as in use but are unavailable just the same. The pending chunks list is anchored by the transaction handle, which means that we need to hold a reference to a transaction when working with the list. The way we use them is by iterating over both lists to perform comparisons on the stripes they describe for each device. This is backwards and requires that we keep a transaction handle open while we're trimming. This patchset adds an extent_io_tree to btrfs_device that maintains the allocation state of the device. Extents are set dirty when chunks are first allocated -- when the extent maps are added to the mapping tree. They're cleared when last removed -- when the extent maps are removed from the mapping tree. This matches the lifespan of the pending and pinned chunks list and allows us to do trims on unallocated space safely without pinning the transaction for what may be a lengthy operation. We can also use this io tree to mark which chunks have already been trimmed so we don't repeat the operation. Signed-off-by: Jeff Mahoney <jeffm@suse.com> Signed-off-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Родитель
68c94e55e1
Коммит
1c11b63eff
|
@ -1149,12 +1149,6 @@ struct btrfs_fs_info {
|
||||||
struct mutex unused_bg_unpin_mutex;
|
struct mutex unused_bg_unpin_mutex;
|
||||||
struct mutex delete_unused_bgs_mutex;
|
struct mutex delete_unused_bgs_mutex;
|
||||||
|
|
||||||
/*
|
|
||||||
* Chunks that can't be freed yet (under a trim/discard operation)
|
|
||||||
* and will be latter freed. Protected by fs_info->chunk_mutex.
|
|
||||||
*/
|
|
||||||
struct list_head pinned_chunks;
|
|
||||||
|
|
||||||
/* Cached block sizes */
|
/* Cached block sizes */
|
||||||
u32 nodesize;
|
u32 nodesize;
|
||||||
u32 sectorsize;
|
u32 sectorsize;
|
||||||
|
|
|
@ -2774,8 +2774,6 @@ int open_ctree(struct super_block *sb,
|
||||||
init_waitqueue_head(&fs_info->async_submit_wait);
|
init_waitqueue_head(&fs_info->async_submit_wait);
|
||||||
init_waitqueue_head(&fs_info->delayed_iputs_wait);
|
init_waitqueue_head(&fs_info->delayed_iputs_wait);
|
||||||
|
|
||||||
INIT_LIST_HEAD(&fs_info->pinned_chunks);
|
|
||||||
|
|
||||||
/* Usable values until the real ones are cached from the superblock */
|
/* Usable values until the real ones are cached from the superblock */
|
||||||
fs_info->nodesize = 4096;
|
fs_info->nodesize = 4096;
|
||||||
fs_info->sectorsize = 4096;
|
fs_info->sectorsize = 4096;
|
||||||
|
@ -4050,15 +4048,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
|
||||||
|
|
||||||
btrfs_free_stripe_hash_table(fs_info);
|
btrfs_free_stripe_hash_table(fs_info);
|
||||||
btrfs_free_ref_cache(fs_info);
|
btrfs_free_ref_cache(fs_info);
|
||||||
|
|
||||||
while (!list_empty(&fs_info->pinned_chunks)) {
|
|
||||||
struct extent_map *em;
|
|
||||||
|
|
||||||
em = list_first_entry(&fs_info->pinned_chunks,
|
|
||||||
struct extent_map, list);
|
|
||||||
list_del_init(&em->list);
|
|
||||||
free_extent_map(em);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
|
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
|
||||||
|
|
|
@ -10946,10 +10946,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
||||||
memcpy(&key, &block_group->key, sizeof(key));
|
memcpy(&key, &block_group->key, sizeof(key));
|
||||||
|
|
||||||
mutex_lock(&fs_info->chunk_mutex);
|
mutex_lock(&fs_info->chunk_mutex);
|
||||||
if (!list_empty(&em->list)) {
|
|
||||||
/* We're in the transaction->pending_chunks list. */
|
|
||||||
free_extent_map(em);
|
|
||||||
}
|
|
||||||
spin_lock(&block_group->lock);
|
spin_lock(&block_group->lock);
|
||||||
block_group->removed = 1;
|
block_group->removed = 1;
|
||||||
/*
|
/*
|
||||||
|
@ -10976,25 +10972,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
||||||
* the transaction commit has completed.
|
* the transaction commit has completed.
|
||||||
*/
|
*/
|
||||||
remove_em = (atomic_read(&block_group->trimming) == 0);
|
remove_em = (atomic_read(&block_group->trimming) == 0);
|
||||||
/*
|
|
||||||
* Make sure a trimmer task always sees the em in the pinned_chunks list
|
|
||||||
* if it sees block_group->removed == 1 (needs to lock block_group->lock
|
|
||||||
* before checking block_group->removed).
|
|
||||||
*/
|
|
||||||
if (!remove_em) {
|
|
||||||
/*
|
|
||||||
* Our em might be in trans->transaction->pending_chunks which
|
|
||||||
* is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
|
|
||||||
* and so is the fs_info->pinned_chunks list.
|
|
||||||
*
|
|
||||||
* So at this point we must be holding the chunk_mutex to avoid
|
|
||||||
* any races with chunk allocation (more specifically at
|
|
||||||
* volumes.c:contains_pending_extent()), to ensure it always
|
|
||||||
* sees the em, either in the pending_chunks list or in the
|
|
||||||
* pinned_chunks list.
|
|
||||||
*/
|
|
||||||
list_move_tail(&em->list, &fs_info->pinned_chunks);
|
|
||||||
}
|
|
||||||
spin_unlock(&block_group->lock);
|
spin_unlock(&block_group->lock);
|
||||||
|
|
||||||
if (remove_em) {
|
if (remove_em) {
|
||||||
|
@ -11002,11 +10979,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
||||||
|
|
||||||
em_tree = &fs_info->mapping_tree.map_tree;
|
em_tree = &fs_info->mapping_tree.map_tree;
|
||||||
write_lock(&em_tree->lock);
|
write_lock(&em_tree->lock);
|
||||||
/*
|
|
||||||
* The em might be in the pending_chunks list, so make sure the
|
|
||||||
* chunk mutex is locked, since remove_extent_mapping() will
|
|
||||||
* delete us from that list.
|
|
||||||
*/
|
|
||||||
remove_extent_mapping(em_tree, em);
|
remove_extent_mapping(em_tree, em);
|
||||||
write_unlock(&em_tree->lock);
|
write_unlock(&em_tree->lock);
|
||||||
/* once for the tree */
|
/* once for the tree */
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/spinlock.h>
|
#include <linux/spinlock.h>
|
||||||
#include "ctree.h"
|
#include "ctree.h"
|
||||||
|
#include "volumes.h"
|
||||||
#include "extent_map.h"
|
#include "extent_map.h"
|
||||||
#include "compression.h"
|
#include "compression.h"
|
||||||
|
|
||||||
|
@ -337,6 +338,37 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
|
||||||
try_merge_map(tree, em);
|
try_merge_map(tree, em);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
|
||||||
|
{
|
||||||
|
struct map_lookup *map = em->map_lookup;
|
||||||
|
u64 stripe_size = em->orig_block_len;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < map->num_stripes; i++) {
|
||||||
|
struct btrfs_bio_stripe *stripe = &map->stripes[i];
|
||||||
|
struct btrfs_device *device = stripe->dev;
|
||||||
|
|
||||||
|
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
|
||||||
|
stripe->physical + stripe_size - 1, bits);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
|
||||||
|
{
|
||||||
|
struct map_lookup *map = em->map_lookup;
|
||||||
|
u64 stripe_size = em->orig_block_len;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < map->num_stripes; i++) {
|
||||||
|
struct btrfs_bio_stripe *stripe = &map->stripes[i];
|
||||||
|
struct btrfs_device *device = stripe->dev;
|
||||||
|
|
||||||
|
__clear_extent_bit(&device->alloc_state, stripe->physical,
|
||||||
|
stripe->physical + stripe_size - 1, bits,
|
||||||
|
0, 0, NULL, GFP_NOWAIT, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* add_extent_mapping - add new extent map to the extent tree
|
* add_extent_mapping - add new extent map to the extent tree
|
||||||
* @tree: tree to insert new map in
|
* @tree: tree to insert new map in
|
||||||
|
@ -357,6 +389,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
setup_extent_mapping(tree, em, modified);
|
setup_extent_mapping(tree, em, modified);
|
||||||
|
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
|
||||||
|
extent_map_device_set_bits(em, CHUNK_ALLOCATED);
|
||||||
out:
|
out:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -438,6 +472,8 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
|
||||||
rb_erase_cached(&em->rb_node, &tree->map);
|
rb_erase_cached(&em->rb_node, &tree->map);
|
||||||
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
|
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
|
||||||
list_del_init(&em->list);
|
list_del_init(&em->list);
|
||||||
|
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
|
||||||
|
extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
|
||||||
RB_CLEAR_NODE(&em->rb_node);
|
RB_CLEAR_NODE(&em->rb_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3366,10 +3366,6 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
|
||||||
em = lookup_extent_mapping(em_tree, block_group->key.objectid,
|
em = lookup_extent_mapping(em_tree, block_group->key.objectid,
|
||||||
1);
|
1);
|
||||||
BUG_ON(!em); /* logic error, can't happen */
|
BUG_ON(!em); /* logic error, can't happen */
|
||||||
/*
|
|
||||||
* remove_extent_mapping() will delete us from the pinned_chunks
|
|
||||||
* list, which is protected by the chunk mutex.
|
|
||||||
*/
|
|
||||||
remove_extent_mapping(em_tree, em);
|
remove_extent_mapping(em_tree, em);
|
||||||
write_unlock(&em_tree->lock);
|
write_unlock(&em_tree->lock);
|
||||||
mutex_unlock(&fs_info->chunk_mutex);
|
mutex_unlock(&fs_info->chunk_mutex);
|
||||||
|
|
|
@ -50,14 +50,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
|
||||||
btrfs_err(transaction->fs_info,
|
btrfs_err(transaction->fs_info,
|
||||||
"pending csums is %llu",
|
"pending csums is %llu",
|
||||||
transaction->delayed_refs.pending_csums);
|
transaction->delayed_refs.pending_csums);
|
||||||
while (!list_empty(&transaction->pending_chunks)) {
|
|
||||||
struct extent_map *em;
|
|
||||||
|
|
||||||
em = list_first_entry(&transaction->pending_chunks,
|
|
||||||
struct extent_map, list);
|
|
||||||
list_del_init(&em->list);
|
|
||||||
free_extent_map(em);
|
|
||||||
}
|
|
||||||
/*
|
/*
|
||||||
* If any block groups are found in ->deleted_bgs then it's
|
* If any block groups are found in ->deleted_bgs then it's
|
||||||
* because the transaction was aborted and a commit did not
|
* because the transaction was aborted and a commit did not
|
||||||
|
@ -235,7 +227,6 @@ loop:
|
||||||
spin_lock_init(&cur_trans->delayed_refs.lock);
|
spin_lock_init(&cur_trans->delayed_refs.lock);
|
||||||
|
|
||||||
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
|
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
|
||||||
INIT_LIST_HEAD(&cur_trans->pending_chunks);
|
|
||||||
INIT_LIST_HEAD(&cur_trans->dev_update_list);
|
INIT_LIST_HEAD(&cur_trans->dev_update_list);
|
||||||
INIT_LIST_HEAD(&cur_trans->switch_commits);
|
INIT_LIST_HEAD(&cur_trans->switch_commits);
|
||||||
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
|
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
|
||||||
|
|
|
@ -51,7 +51,6 @@ struct btrfs_transaction {
|
||||||
wait_queue_head_t writer_wait;
|
wait_queue_head_t writer_wait;
|
||||||
wait_queue_head_t commit_wait;
|
wait_queue_head_t commit_wait;
|
||||||
struct list_head pending_snapshots;
|
struct list_head pending_snapshots;
|
||||||
struct list_head pending_chunks;
|
|
||||||
struct list_head dev_update_list;
|
struct list_head dev_update_list;
|
||||||
struct list_head switch_commits;
|
struct list_head switch_commits;
|
||||||
struct list_head dirty_bgs;
|
struct list_head dirty_bgs;
|
||||||
|
|
|
@ -336,6 +336,7 @@ void btrfs_free_device(struct btrfs_device *device)
|
||||||
{
|
{
|
||||||
WARN_ON(!list_empty(&device->post_commit_list));
|
WARN_ON(!list_empty(&device->post_commit_list));
|
||||||
rcu_string_free(device->name);
|
rcu_string_free(device->name);
|
||||||
|
extent_io_tree_release(&device->alloc_state);
|
||||||
bio_put(device->flush_bio);
|
bio_put(device->flush_bio);
|
||||||
kfree(device);
|
kfree(device);
|
||||||
}
|
}
|
||||||
|
@ -412,6 +413,7 @@ static struct btrfs_device *__alloc_device(void)
|
||||||
btrfs_device_data_ordered_init(dev);
|
btrfs_device_data_ordered_init(dev);
|
||||||
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
|
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
|
||||||
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
|
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
|
||||||
|
extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
|
||||||
|
|
||||||
return dev;
|
return dev;
|
||||||
}
|
}
|
||||||
|
@ -1499,58 +1501,30 @@ error_bdev_put:
|
||||||
return device;
|
return device;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int contains_pending_extent(struct btrfs_transaction *transaction,
|
/*
|
||||||
struct btrfs_device *device,
|
* Try to find a chunk that intersects [start, start + len] range and when one
|
||||||
u64 *start, u64 len)
|
* such is found, record the end of it in *start
|
||||||
|
*/
|
||||||
|
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
|
||||||
|
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
|
||||||
|
u64 len)
|
||||||
{
|
{
|
||||||
struct btrfs_fs_info *fs_info = device->fs_info;
|
u64 physical_start, physical_end;
|
||||||
struct extent_map *em;
|
|
||||||
struct list_head *search_list = &fs_info->pinned_chunks;
|
|
||||||
int ret = 0;
|
|
||||||
u64 physical_start = *start;
|
|
||||||
|
|
||||||
if (transaction)
|
lockdep_assert_held(&device->fs_info->chunk_mutex);
|
||||||
search_list = &transaction->pending_chunks;
|
|
||||||
again:
|
|
||||||
list_for_each_entry(em, search_list, list) {
|
|
||||||
struct map_lookup *map;
|
|
||||||
int i;
|
|
||||||
|
|
||||||
map = em->map_lookup;
|
if (!find_first_extent_bit(&device->alloc_state, *start,
|
||||||
for (i = 0; i < map->num_stripes; i++) {
|
&physical_start, &physical_end,
|
||||||
u64 end;
|
CHUNK_ALLOCATED, NULL)) {
|
||||||
|
|
||||||
if (map->stripes[i].dev != device)
|
if (in_range(physical_start, *start, len) ||
|
||||||
continue;
|
in_range(*start, physical_start,
|
||||||
if (map->stripes[i].physical >= physical_start + len ||
|
physical_end - physical_start)) {
|
||||||
map->stripes[i].physical + em->orig_block_len <=
|
*start = physical_end + 1;
|
||||||
physical_start)
|
return true;
|
||||||
continue;
|
|
||||||
/*
|
|
||||||
* Make sure that while processing the pinned list we do
|
|
||||||
* not override our *start with a lower value, because
|
|
||||||
* we can have pinned chunks that fall within this
|
|
||||||
* device hole and that have lower physical addresses
|
|
||||||
* than the pending chunks we processed before. If we
|
|
||||||
* do not take this special care we can end up getting
|
|
||||||
* 2 pending chunks that start at the same physical
|
|
||||||
* device offsets because the end offset of a pinned
|
|
||||||
* chunk can be equal to the start offset of some
|
|
||||||
* pending chunk.
|
|
||||||
*/
|
|
||||||
end = map->stripes[i].physical + em->orig_block_len;
|
|
||||||
if (end > *start) {
|
|
||||||
*start = end;
|
|
||||||
ret = 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (search_list != &fs_info->pinned_chunks) {
|
return false;
|
||||||
search_list = &fs_info->pinned_chunks;
|
|
||||||
goto again;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1661,15 +1635,12 @@ again:
|
||||||
* Have to check before we set max_hole_start, otherwise
|
* Have to check before we set max_hole_start, otherwise
|
||||||
* we could end up sending back this offset anyway.
|
* we could end up sending back this offset anyway.
|
||||||
*/
|
*/
|
||||||
if (contains_pending_extent(transaction, device,
|
if (contains_pending_extent(device, &search_start,
|
||||||
&search_start,
|
|
||||||
hole_size)) {
|
hole_size)) {
|
||||||
if (key.offset >= search_start) {
|
if (key.offset >= search_start)
|
||||||
hole_size = key.offset - search_start;
|
hole_size = key.offset - search_start;
|
||||||
} else {
|
else
|
||||||
WARN_ON_ONCE(1);
|
|
||||||
hole_size = 0;
|
hole_size = 0;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hole_size > max_hole_size) {
|
if (hole_size > max_hole_size) {
|
||||||
|
@ -1710,8 +1681,7 @@ next:
|
||||||
if (search_end > search_start) {
|
if (search_end > search_start) {
|
||||||
hole_size = search_end - search_start;
|
hole_size = search_end - search_start;
|
||||||
|
|
||||||
if (contains_pending_extent(transaction, device, &search_start,
|
if (contains_pending_extent(device, &search_start, hole_size)) {
|
||||||
hole_size)) {
|
|
||||||
btrfs_release_path(path);
|
btrfs_release_path(path);
|
||||||
goto again;
|
goto again;
|
||||||
}
|
}
|
||||||
|
@ -4756,7 +4726,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
|
||||||
* in-memory chunks are synced to disk so that the loop below sees them
|
* in-memory chunks are synced to disk so that the loop below sees them
|
||||||
* and relocates them accordingly.
|
* and relocates them accordingly.
|
||||||
*/
|
*/
|
||||||
if (contains_pending_extent(trans->transaction, device, &start, diff)) {
|
if (contains_pending_extent(device, &start, diff)) {
|
||||||
mutex_unlock(&fs_info->chunk_mutex);
|
mutex_unlock(&fs_info->chunk_mutex);
|
||||||
ret = btrfs_commit_transaction(trans);
|
ret = btrfs_commit_transaction(trans);
|
||||||
if (ret)
|
if (ret)
|
||||||
|
@ -5189,9 +5159,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
|
||||||
free_extent_map(em);
|
free_extent_map(em);
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
list_add_tail(&em->list, &trans->transaction->pending_chunks);
|
|
||||||
refcount_inc(&em->refs);
|
|
||||||
write_unlock(&em_tree->lock);
|
write_unlock(&em_tree->lock);
|
||||||
|
|
||||||
ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
|
ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
|
||||||
|
@ -5224,8 +5191,6 @@ error_del_extent:
|
||||||
free_extent_map(em);
|
free_extent_map(em);
|
||||||
/* One for the tree reference */
|
/* One for the tree reference */
|
||||||
free_extent_map(em);
|
free_extent_map(em);
|
||||||
/* One for the pending_chunks list reference */
|
|
||||||
free_extent_map(em);
|
|
||||||
error:
|
error:
|
||||||
kfree(devices_info);
|
kfree(devices_info);
|
||||||
return ret;
|
return ret;
|
||||||
|
|
|
@ -133,6 +133,8 @@ struct btrfs_device {
|
||||||
/* Counter to record the change of device stats */
|
/* Counter to record the change of device stats */
|
||||||
atomic_t dev_stats_ccnt;
|
atomic_t dev_stats_ccnt;
|
||||||
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
|
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
|
||||||
|
|
||||||
|
struct extent_io_tree alloc_state;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Загрузка…
Ссылка в новой задаче