Btrfs: separate sequence numbers for delayed ref tracking and tree mod log

Sequence numbers for delayed refs have been introduced in the first version
of the qgroup patch set. To solve the problem of find_all_roots on a busy
file system, the tree mod log was introduced. The sequence numbers for that
were simply shared between those two users.

However, at one point in qgroup's quota accounting, there's a statement
accessing the previous sequence number, that's still just doing (seq - 1)
just as it would have to in the very first version.

To satisfy that requirement, this patch makes the sequence number counter 64
bit and splits it into a major part (used for qgroup sequence number
counting) and a minor part (incremented for each tree modification in the
log). This enables us to go exactly one major step backwards, as required
for qgroups, while still incrementing the sequence counter for tree mod log
insertions to keep track of their order. Keeping them in a single variable
means there's no need to change all the code dealing with comparisons of two
sequence numbers.

The sequence number is reset to 0 on commit (not new in this patch), which
ensures we won't overflow the two 32 bit counters.

Without this fix, the qgroup tracking can occasionally go wrong and WARN_ONs
from the tree mod log code may happen.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
This commit is contained in:
Jan Schmidt 2013-04-24 16:57:33 +00:00 коммит произвёл Josef Bacik
Родитель 6d49ba1b47
Коммит fc36ed7e0b
7 изменённых файлов: 63 добавлений и 19 удалений

Просмотреть файл

@ -360,6 +360,44 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
write_unlock(&fs_info->tree_mod_log_lock); write_unlock(&fs_info->tree_mod_log_lock);
} }
/*
* Increment the upper half of tree_mod_seq, set lower half zero.
*
* Must be called with fs_info->tree_mod_seq_lock held.
*/
static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
{
u64 seq = atomic64_read(&fs_info->tree_mod_seq);
seq &= 0xffffffff00000000ull;
seq += 1ull << 32;
atomic64_set(&fs_info->tree_mod_seq, seq);
return seq;
}
/*
* Increment the lower half of tree_mod_seq.
*
* Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
* are generated should not technically require a spin lock here. (Rationale:
* incrementing the minor while incrementing the major seq number is between its
* atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
* just returns a unique sequence number as usual.) We have decided to leave
* that requirement in here and rethink it once we notice it really imposes a
* problem on some workload.
*/
static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
* return the last minor in the previous major tree_mod_seq number
*/
u64 btrfs_tree_mod_seq_prev(u64 seq)
{
return (seq & 0xffffffff00000000ull) - 1ull;
}
/* /*
* This adds a new blocker to the tree mod log's blocker list if the @elem * This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects * passed does not already have a sequence number set. So when a caller expects
@ -376,10 +414,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
tree_mod_log_write_lock(fs_info); tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock); spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) { if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info); elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
} }
seq = btrfs_inc_tree_mod_seq(fs_info); seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock); spin_unlock(&fs_info->tree_mod_seq_lock);
tree_mod_log_write_unlock(fs_info); tree_mod_log_write_unlock(fs_info);
@ -524,7 +562,10 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
if (!tm) if (!tm)
return -ENOMEM; return -ENOMEM;
tm->seq = btrfs_inc_tree_mod_seq(fs_info); spin_lock(&fs_info->tree_mod_seq_lock);
tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
return tm->seq; return tm->seq;
} }

Просмотреть файл

@ -1422,7 +1422,7 @@ struct btrfs_fs_info {
/* this protects tree_mod_seq_list */ /* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock; spinlock_t tree_mod_seq_lock;
atomic_t tree_mod_seq; atomic64_t tree_mod_seq;
struct list_head tree_mod_seq_list; struct list_head tree_mod_seq_list;
struct seq_list tree_mod_seq_elem; struct seq_list tree_mod_seq_elem;
@ -3332,10 +3332,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem); struct seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem); struct seq_list *elem);
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) u64 btrfs_tree_mod_seq_prev(u64 seq);
{
return atomic_inc_return(&fs_info->tree_mod_seq);
}
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */ /* root-item.c */

Просмотреть файл

@ -361,8 +361,10 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
elem = list_first_entry(&fs_info->tree_mod_seq_list, elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list); struct seq_list, list);
if (seq >= elem->seq) { if (seq >= elem->seq) {
pr_debug("holding back delayed_ref %llu, lowest is " pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
"%llu (%p)\n", seq, elem->seq, delayed_refs); (u32)(seq >> 32), (u32)seq,
(u32)(elem->seq >> 32), (u32)elem->seq,
delayed_refs);
ret = 1; ret = 1;
} }
} }

Просмотреть файл

@ -2157,7 +2157,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->tree_mod_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb; fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024; fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0; fs_info->metadata_ratio = 0;

Просмотреть файл

@ -2541,9 +2541,10 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
!trans->delayed_ref_elem.seq) { !trans->delayed_ref_elem.seq) {
/* list without seq or seq without list */ /* list without seq or seq without list */
btrfs_err(fs_info, btrfs_err(fs_info,
"qgroup accounting update error, list is%s empty, seq is %llu", "qgroup accounting update error, list is%s empty, seq is %#x.%x",
list_empty(&trans->qgroup_ref_list) ? "" : " not", list_empty(&trans->qgroup_ref_list) ? "" : " not",
trans->delayed_ref_elem.seq); (u32)(trans->delayed_ref_elem.seq >> 32),
(u32)trans->delayed_ref_elem.seq);
BUG(); BUG();
} }

Просмотреть файл

@ -1242,9 +1242,11 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
case BTRFS_ADD_DELAYED_REF: case BTRFS_ADD_DELAYED_REF:
case BTRFS_ADD_DELAYED_EXTENT: case BTRFS_ADD_DELAYED_EXTENT:
sgn = 1; sgn = 1;
seq = btrfs_tree_mod_seq_prev(node->seq);
break; break;
case BTRFS_DROP_DELAYED_REF: case BTRFS_DROP_DELAYED_REF:
sgn = -1; sgn = -1;
seq = node->seq;
break; break;
case BTRFS_UPDATE_DELAYED_HEAD: case BTRFS_UPDATE_DELAYED_HEAD:
return 0; return 0;
@ -1254,14 +1256,14 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
/* /*
* the delayed ref sequence number we pass depends on the direction of * the delayed ref sequence number we pass depends on the direction of
* the operation. for add operations, we pass (node->seq - 1) to skip * the operation. for add operations, we pass
* tree_mod_log_prev_seq(node->seq) to skip
* the delayed ref's current sequence number, because we need the state * the delayed ref's current sequence number, because we need the state
* of the tree before the add operation. for delete operations, we pass * of the tree before the add operation. for delete operations, we pass
* (node->seq) to include the delayed ref's current sequence number, * (node->seq) to include the delayed ref's current sequence number,
* because we need the state of the tree after the delete operation. * because we need the state of the tree after the delete operation.
*/ */
ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
sgn > 0 ? node->seq - 1 : node->seq, &roots);
if (ret < 0) if (ret < 0)
return ret; return ret;
@ -1772,8 +1774,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{ {
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
return; return;
printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n", pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
trans->delayed_ref_elem.seq); (u32)(trans->delayed_ref_elem.seq >> 32),
(u32)trans->delayed_ref_elem.seq);
BUG(); BUG();
} }

Просмотреть файл

@ -162,7 +162,7 @@ loop:
if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
"creating a fresh transaction\n"); "creating a fresh transaction\n");
atomic_set(&fs_info->tree_mod_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0);
spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock); spin_lock_init(&cur_trans->delayed_refs.lock);