/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#ifndef __BTRFS_FREE_SPACE_CACHE
#define __BTRFS_FREE_SPACE_CACHE

/*
 * Free space is tracked with a hybrid of extent entries and bitmaps.
 *
 * Tracking free space purely with extent entries can consume a
 * ridiculous amount of RAM: as free space fragments, a block group
 * (usually spanning 1GB) ends up with thousands of entries in its
 * rb-tree, which gets unwieldy on large, fragmented filesystems.
 * Instead, we compute a threshold of extent entries (however many fit
 * in 16KB of RAM) and, once a block group passes it, further free
 * space is tracked in PAGE_SIZE bitmaps. At most 32KB of RAM is then
 * needed to track 1GB of disk space.
 *
 * Any free space smaller than 4 * sectorsize goes straight into a
 * bitmap. Since we try to allocate from the front of a block group, a
 * heavily fragmented front lands in bitmaps while a big chunk of free
 * space at the back keeps a normal extent entry.
 *
 * Entries are indexed by a single offset-sorted rb-tree; there is no
 * separate bytes-indexed tree. This halves the number of tree
 * operations and gives a better allocation pattern: we always start
 * from a specific offset and search forward, rather than searching by
 * size and approximating the wanted offset.
 */
struct btrfs_free_space {
	struct rb_node offset_index;	/* node in the offset-sorted tree */
	u64 offset;			/* start of the free range */
	u64 bytes;			/* amount of free space tracked */
	unsigned long *bitmap;		/* non-NULL for a bitmap entry */
	struct list_head list;
};
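
/*
 * Illustrative only, not kernel API: for a bitmap entry, 'offset' is
 * where the covered range starts, each bit stands for 'ctl->unit'
 * bytes (the sectorsize for block groups), and 'bytes' is the free
 * space currently marked in the bitmap. Mapping a disk offset to its
 * bit looks like:
 *
 *	static unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
 *					   u64 offset)
 *	{
 *		return (unsigned long)((offset - bitmap_start) / unit);
 *	}
 *
 * An extent entry instead leaves 'bitmap' NULL and describes the
 * single contiguous run [offset, offset + bytes).
 */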

struct btrfs_free_space_ctl {
	spinlock_t tree_lock;
	struct rb_root free_space_offset;	/* offset-sorted entries */
	u64 free_space;				/* total free bytes tracked */
	int extents_thresh;			/* extent entries allowed before
						   bitmaps are used */
	int free_extents;			/* current extent entry count */
	int total_bitmaps;			/* current bitmap entry count */
	int unit;				/* bytes represented per bitmap bit */
	u64 start;				/* start of the managed range */
	struct btrfs_free_space_op *op;		/* per-user policy hooks */
	void *private;
};
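
/*
 * A sketch, under stated assumptions, of how recalc_thresholds() might
 * size extents_thresh against the 16KB budget described above; the
 * real implementations in free-space-cache.c and inode-map.c differ in
 * detail:
 *
 *	static void example_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
 *	{
 *		u64 budget = 16 * 1024;		// RAM allowed for entries
 *		// Each bitmap already spends PAGE_SIZE bytes of the budget.
 *		u64 bitmap_bytes = (u64)ctl->total_bitmaps * PAGE_SIZE;
 *
 *		if (bitmap_bytes >= budget) {
 *			ctl->extents_thresh = 0;
 *			return;
 *		}
 *		ctl->extents_thresh = (budget - bitmap_bytes) /
 *				      sizeof(struct btrfs_free_space);
 *	}
 */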

struct btrfs_free_space_op {
	void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
	bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
};
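
/*
 * A minimal sketch of a use_bitmap() policy (hypothetical; the real
 * policies live in free-space-cache.c for block groups and inode-map.c
 * for inode numbers): stay with plain extent entries until the
 * threshold is reached, then push new free space into bitmaps.
 *
 *	static bool example_use_bitmap(struct btrfs_free_space_ctl *ctl,
 *				       struct btrfs_free_space *info)
 *	{
 *		if (ctl->free_extents < ctl->extents_thresh)
 *			return false;	// extent entries are still cheap
 *		return true;		// over budget, use a bitmap
 *	}
 */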

struct inode *lookup_free_space_inode(struct btrfs_root *root,
				      struct btrfs_block_group_cache
				      *block_group, struct btrfs_path *path);
int create_free_space_inode(struct btrfs_root *root,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group_cache *block_group,
			    struct btrfs_path *path);

int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
				       struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct inode *inode);
int load_free_space_cache(struct btrfs_fs_info *fs_info,
			  struct btrfs_block_group_cache *block_group);
int btrfs_write_out_cache(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_block_group_cache *block_group,
			  struct btrfs_path *path);

/*
 * Free inode number caching. btrfs stores the highest objectid of the
 * fs tree and used to hand out (highest + 1) for every new file, so
 * inode numbers were never reclaimed when files were deleted and a
 * 32-bit machine could run out of them under sustained create/delete
 * load. To fix that, a kernel thread scans the fs tree's inode items
 * (through the commit root, so the cross-transaction cases need
 * careful handling) and caches the free chunks of inode numbers in the
 * same hybrid extent+bitmap rb-tree used for block group free space:
 * 16KB of RAM worth of extent entries is allowed before bitmaps take
 * over, and the threshold is adjusted at runtime.
 */
struct inode *lookup_free_ino_inode(struct btrfs_root *root,
				    struct btrfs_path *path);
int create_free_ino_inode(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_path *path);
int load_free_ino_cache(struct btrfs_fs_info *fs_info,
			struct btrfs_root *root);
int btrfs_write_out_ino_cache(struct btrfs_root *root,
			      struct btrfs_trans_handle *trans,
			      struct btrfs_path *path);
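
/*
 * Illustrative only: because inode numbers reuse struct
 * btrfs_free_space_ctl, giving a deleted inode's number back to the
 * cache is just adding a "free extent" of size 1, and allocation is a
 * lookup in the same tree (the real callers are in inode-map.c):
 *
 *	// on delete, under the cache's locking:
 *	__btrfs_add_free_space(ctl, objectid, 1);
 *
 *	// on create:
 *	u64 objectid = btrfs_find_ino_for_alloc(root);
 */
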
void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
			   u64 bytenr, u64 size);
static inline int
btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
		     u64 bytenr, u64 size)
{
	return __btrfs_add_free_space(block_group->free_space_ctl,
				      bytenr, size);
}
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
			    u64 bytenr, u64 size);
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
				   *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
			       u64 offset, u64 bytes, u64 empty_size,
			       u64 *max_extent_size);
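
/*
 * Illustrative use (hypothetical caller; names like 'search_start' and
 * 'num_bytes' are made up): carve 'bytes' out of a block group at or
 * after 'offset'. On failure the return value is 0 and
 * *max_extent_size reports the largest free extent seen, so the caller
 * can skip this block group for any larger allocation.
 *
 *	u64 max_extent_size = 0;
 *	u64 start = btrfs_find_space_for_alloc(block_group, search_start,
 *					       num_bytes, empty_size,
 *					       &max_extent_size);
 *	if (!start)
 *		;	// not enough contiguous space here
 */
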
u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
			   u64 bytes);
int btrfs_find_space_cluster(struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size);
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster, u64 bytes,
			     u64 min_start, u64 *max_extent_size);
int btrfs_return_cluster_to_free_space(
			       struct btrfs_block_group_cache *block_group,
			       struct btrfs_free_cluster *cluster);
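
/*
 * Cluster lifecycle, sketched with a hypothetical caller (the real
 * user is the extent allocator in extent-tree.c): initialize, try to
 * gather contiguous space, allocate from it, and hand anything unused
 * back to the block group.
 *
 *	struct btrfs_free_cluster cluster;
 *
 *	btrfs_init_free_cluster(&cluster);
 *	if (!btrfs_find_space_cluster(root, block_group, &cluster,
 *				      search_start, num_bytes, empty_size)) {
 *		u64 start = btrfs_alloc_from_cluster(block_group, &cluster,
 *						     num_bytes, min_start,
 *						     &max_extent_size);
 *		...
 *	}
 *	btrfs_return_cluster_to_free_space(block_group, &cluster);
 */
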
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen);

/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
			      u64 offset, u64 bytes, bool bitmap);
int test_check_exists(struct btrfs_block_group_cache *cache,
		      u64 offset, u64 bytes);
#endif

#endif