mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-22 16:06:04 -05:00
bcachefs: fix O(n^2) issue with whiteouts in journal keys
The journal_keys array can't be substantially modified after we go RW, because lookups need to be able to check it locklessly - thus we're limited in what we can do when a key in the journal has been overwritten. This is a problem when there are many overwrites to skip over for peek() operations. To fix this, add tracking of ranges of overwrites: we create a range entry when there's more than one contiguous whiteout. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
854724d116
commit
eae6c4a625
5 changed files with 179 additions and 41 deletions
|
@ -205,6 +205,7 @@
|
||||||
#include <linux/zstd.h>
|
#include <linux/zstd.h>
|
||||||
|
|
||||||
#include "bcachefs_format.h"
|
#include "bcachefs_format.h"
|
||||||
|
#include "btree_journal_iter_types.h"
|
||||||
#include "disk_accounting_types.h"
|
#include "disk_accounting_types.h"
|
||||||
#include "errcode.h"
|
#include "errcode.h"
|
||||||
#include "fifo.h"
|
#include "fifo.h"
|
||||||
|
@ -658,28 +659,6 @@ struct journal_seq_blacklist_table {
|
||||||
} entries[];
|
} entries[];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct journal_keys {
|
|
||||||
/* must match layout in darray_types.h */
|
|
||||||
size_t nr, size;
|
|
||||||
struct journal_key {
|
|
||||||
u64 journal_seq;
|
|
||||||
u32 journal_offset;
|
|
||||||
enum btree_id btree_id:8;
|
|
||||||
unsigned level:8;
|
|
||||||
bool allocated;
|
|
||||||
bool overwritten;
|
|
||||||
struct bkey_i *k;
|
|
||||||
} *data;
|
|
||||||
/*
|
|
||||||
* Gap buffer: instead of all the empty space in the array being at the
|
|
||||||
* end of the buffer - from @nr to @size - the empty space is at @gap.
|
|
||||||
* This means that sequential insertions are O(n) instead of O(n^2).
|
|
||||||
*/
|
|
||||||
size_t gap;
|
|
||||||
atomic_t ref;
|
|
||||||
bool initial_ref_held;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct btree_trans_buf {
|
struct btree_trans_buf {
|
||||||
struct btree_trans *trans;
|
struct btree_trans *trans;
|
||||||
};
|
};
|
||||||
|
|
|
@ -16,6 +16,17 @@
|
||||||
* operations for the regular btree iter code to use:
|
* operations for the regular btree iter code to use:
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*
 * pos_to_idx - convert a raw position in keys->data into a logical key index.
 *
 * keys->data contains a gap of (keys->size - keys->nr) unused slots starting
 * at keys->gap.  A position before the gap is returned unchanged; a position
 * at or after the gap has the gap size subtracted from it.
 *
 * @keys: journal keys array
 * @pos:  raw position in keys->data; must not lie inside the gap (BUG_ON)
 *
 * Returns the logical index.  Presumably the inverse of idx_to_pos() - only
 * part of that function is visible here, so confirm against its body.
 */
static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
|
||||||
|
{
|
||||||
|
size_t gap_size = keys->size - keys->nr;
|
||||||
|
|
||||||
|
/* a position inside [gap, gap + gap_size) does not name any key */
BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
|
||||||
|
|
||||||
|
if (pos >= keys->gap)
|
||||||
|
pos -= gap_size;
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
|
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
|
||||||
{
|
{
|
||||||
size_t gap_size = keys->size - keys->nr;
|
size_t gap_size = keys->size - keys->nr;
|
||||||
|
@ -84,27 +95,37 @@ struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct bkey_i *ret = NULL;
|
||||||
|
rcu_read_lock(); /* for overwritten_ranges */
|
||||||
|
|
||||||
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
|
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
|
||||||
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
|
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
|
||||||
return NULL;
|
break;
|
||||||
|
|
||||||
if (k->overwritten) {
|
if (k->overwritten) {
|
||||||
(*idx)++;
|
if (k->overwritten_range)
|
||||||
|
*idx = rcu_dereference(k->overwritten_range)->end;
|
||||||
|
else
|
||||||
|
*idx += 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
|
if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
|
||||||
return k->k;
|
ret = k->k;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
(*idx)++;
|
(*idx)++;
|
||||||
iters++;
|
iters++;
|
||||||
if (iters == 10) {
|
if (iters == 10) {
|
||||||
*idx = 0;
|
*idx = 0;
|
||||||
|
rcu_read_unlock();
|
||||||
goto search;
|
goto search;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL;
|
rcu_read_unlock();
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
|
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
|
||||||
|
@ -130,17 +151,25 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct bkey_i *ret = NULL;
|
||||||
|
rcu_read_lock(); /* for overwritten_ranges */
|
||||||
|
|
||||||
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
|
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
|
||||||
if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
|
if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
|
||||||
return NULL;
|
break;
|
||||||
|
|
||||||
if (k->overwritten) {
|
if (k->overwritten) {
|
||||||
--(*idx);
|
if (k->overwritten_range)
|
||||||
|
*idx = rcu_dereference(k->overwritten_range)->start - 1;
|
||||||
|
else
|
||||||
|
*idx -= 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (__journal_key_cmp(btree_id, level, pos, k) >= 0)
|
if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
|
||||||
return k->k;
|
ret = k->k;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
--(*idx);
|
--(*idx);
|
||||||
iters++;
|
iters++;
|
||||||
|
@ -150,7 +179,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL;
|
rcu_read_unlock();
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
|
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
|
||||||
|
@ -163,6 +193,7 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
|
||||||
|
|
||||||
static void journal_iter_verify(struct journal_iter *iter)
|
static void journal_iter_verify(struct journal_iter *iter)
|
||||||
{
|
{
|
||||||
|
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||||
struct journal_keys *keys = iter->keys;
|
struct journal_keys *keys = iter->keys;
|
||||||
size_t gap_size = keys->size - keys->nr;
|
size_t gap_size = keys->size - keys->nr;
|
||||||
|
|
||||||
|
@ -175,6 +206,7 @@ static void journal_iter_verify(struct journal_iter *iter)
|
||||||
int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
|
int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
|
||||||
BUG_ON(cmp > 0);
|
BUG_ON(cmp > 0);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void journal_iters_fix(struct bch_fs *c)
|
static void journal_iters_fix(struct bch_fs *c)
|
||||||
|
@ -335,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
|
||||||
bkey_deleted(&keys->data[idx].k->k));
|
bkey_deleted(&keys->data[idx].k->k));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * __bch2_journal_key_overwritten - mark one journal key as overwritten and
 * maintain the "overwritten range" that lets readers skip a whole run of
 * overwritten keys in one step.
 *
 * @keys: journal keys array
 * @pos:  raw position of the key in keys->data (not a logical index)
 *
 * The key's logical neighbours (idx - 1 and idx + 1) are inspected:
 *  - both neighbours already belong to ranges: the two ranges are merged into
 *    the previous one and the next one is freed with kfree_rcu_mightsleep();
 *  - only one neighbour has a range: that range grows by one to cover this
 *    key, and by one more if the other neighbour is overwritten but has no
 *    range of its own;
 *  - no neighbour has a range but at least one is overwritten: a new range is
 *    allocated covering this key and its overwritten neighbour(s);
 *  - no neighbour is overwritten: only the ->overwritten flag is set.
 *
 * Ranges are expressed in logical indices and are half-open, [start, end),
 * as the BUG_ON() checks and the merge loop below show.
 *
 * If the range allocation fails the function returns with ->overwritten set
 * and no range attached.
 *
 * The caller visible in this file, bch2_journal_key_overwritten(), holds
 * keys->overwrite_lock around this call.
 */
static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
|
||||||
|
{
|
||||||
|
struct journal_key *k = keys->data + pos;
|
||||||
|
size_t idx = pos_to_idx(keys, pos);
|
||||||
|
|
||||||
|
k->overwritten = true;
|
||||||
|
|
||||||
|
/* logical neighbours, or NULL at either end of the array */
struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
|
||||||
|
struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
|
||||||
|
|
||||||
|
bool prev_overwritten = prev && prev->overwritten;
|
||||||
|
bool next_overwritten = next && next->overwritten;
|
||||||
|
|
||||||
|
/* a neighbour can be overwritten without having a range attached */
struct journal_key_range_overwritten *prev_range =
|
||||||
|
prev_overwritten ? prev->overwritten_range : NULL;
|
||||||
|
struct journal_key_range_overwritten *next_range =
|
||||||
|
next_overwritten ? next->overwritten_range : NULL;
|
||||||
|
|
||||||
|
/* an adjacent range must end exactly at / start exactly after this key */
BUG_ON(prev_range && prev_range->end != idx);
|
||||||
|
BUG_ON(next_range && next_range->start != idx + 1);
|
||||||
|
|
||||||
|
if (prev_range && next_range) {
|
||||||
|
/* merge: prev_range absorbs this key and all of next_range */
prev_range->end = next_range->end;
|
||||||
|
|
||||||
|
keys->data[pos].overwritten_range = prev_range;
|
||||||
|
for (size_t i = next_range->start; i < next_range->end; i++) {
|
||||||
|
struct journal_key *ip = keys->data + idx_to_pos(keys, i);
|
||||||
|
BUG_ON(ip->overwritten_range != next_range);
|
||||||
|
ip->overwritten_range = prev_range;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* readers dereference ranges under rcu_read_lock(), so free via RCU */
kfree_rcu_mightsleep(next_range);
|
||||||
|
} else if (prev_range) {
|
||||||
|
prev_range->end++;
|
||||||
|
k->overwritten_range = prev_range;
|
||||||
|
/* next is overwritten but has no range: pull it into prev_range too */
if (next_overwritten) {
|
||||||
|
prev_range->end++;
|
||||||
|
next->overwritten_range = prev_range;
|
||||||
|
}
|
||||||
|
} else if (next_range) {
|
||||||
|
next_range->start--;
|
||||||
|
k->overwritten_range = next_range;
|
||||||
|
/* prev is overwritten but has no range: pull it into next_range too */
if (prev_overwritten) {
|
||||||
|
next_range->start--;
|
||||||
|
prev->overwritten_range = next_range;
|
||||||
|
}
|
||||||
|
} else if (prev_overwritten || next_overwritten) {
|
||||||
|
struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
|
||||||
|
/* allocation failure: the key stays flagged overwritten, just without a range */
if (!r)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* the bools convert to 0 or 1, widening the range over each overwritten neighbour */
r->start = idx - (size_t) prev_overwritten;
|
||||||
|
r->end = idx + 1 + (size_t) next_overwritten;
|
||||||
|
|
||||||
|
/*
 * NOTE(review): k is published with rcu_assign_pointer() while prev and next
 * below are plain stores to the same __rcu field - confirm whether they need
 * rcu_assign_pointer() as well.
 */
rcu_assign_pointer(k->overwritten_range, r);
|
||||||
|
if (prev_overwritten)
|
||||||
|
prev->overwritten_range = r;
|
||||||
|
if (next_overwritten)
|
||||||
|
next->overwritten_range = r;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
|
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
|
||||||
unsigned level, struct bpos pos)
|
unsigned level, struct bpos pos)
|
||||||
{
|
{
|
||||||
|
@ -344,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
|
||||||
if (idx < keys->size &&
|
if (idx < keys->size &&
|
||||||
keys->data[idx].btree_id == btree &&
|
keys->data[idx].btree_id == btree &&
|
||||||
keys->data[idx].level == level &&
|
keys->data[idx].level == level &&
|
||||||
bpos_eq(keys->data[idx].k->k.p, pos))
|
bpos_eq(keys->data[idx].k->k.p, pos) &&
|
||||||
keys->data[idx].overwritten = true;
|
!keys->data[idx].overwritten) {
|
||||||
|
mutex_lock(&keys->overwrite_lock);
|
||||||
|
__bch2_journal_key_overwritten(keys, idx);
|
||||||
|
mutex_unlock(&keys->overwrite_lock);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bch2_journal_iter_advance(struct journal_iter *iter)
|
static void bch2_journal_iter_advance(struct journal_iter *iter)
|
||||||
|
@ -359,8 +457,11 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
|
||||||
|
|
||||||
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
|
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
|
||||||
{
|
{
|
||||||
|
struct bkey_s_c ret = bkey_s_c_null;
|
||||||
|
|
||||||
journal_iter_verify(iter);
|
journal_iter_verify(iter);
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
while (iter->idx < iter->keys->size) {
|
while (iter->idx < iter->keys->size) {
|
||||||
struct journal_key *k = iter->keys->data + iter->idx;
|
struct journal_key *k = iter->keys->data + iter->idx;
|
||||||
|
|
||||||
|
@ -369,13 +470,19 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
|
||||||
break;
|
break;
|
||||||
BUG_ON(cmp);
|
BUG_ON(cmp);
|
||||||
|
|
||||||
if (!k->overwritten)
|
if (!k->overwritten) {
|
||||||
return bkey_i_to_s_c(k->k);
|
ret = bkey_i_to_s_c(k->k);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
bch2_journal_iter_advance(iter);
|
if (k->overwritten_range)
|
||||||
|
iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
|
||||||
|
else
|
||||||
|
bch2_journal_iter_advance(iter);
|
||||||
}
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
return bkey_s_c_null;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bch2_journal_iter_exit(struct journal_iter *iter)
|
static void bch2_journal_iter_exit(struct journal_iter *iter)
|
||||||
|
@ -556,9 +663,15 @@ void bch2_journal_keys_put(struct bch_fs *c)
|
||||||
|
|
||||||
move_gap(keys, keys->nr);
|
move_gap(keys, keys->nr);
|
||||||
|
|
||||||
darray_for_each(*keys, i)
|
darray_for_each(*keys, i) {
|
||||||
|
if (i->overwritten_range &&
|
||||||
|
(i == &darray_last(*keys) ||
|
||||||
|
i->overwritten_range != i[1].overwritten_range))
|
||||||
|
kfree(i->overwritten_range);
|
||||||
|
|
||||||
if (i->allocated)
|
if (i->allocated)
|
||||||
kfree(i->k);
|
kfree(i->k);
|
||||||
|
}
|
||||||
|
|
||||||
kvfree(keys->data);
|
kvfree(keys->data);
|
||||||
keys->data = NULL;
|
keys->data = NULL;
|
||||||
|
@ -682,3 +795,12 @@ void bch2_journal_keys_dump(struct bch_fs *c)
|
||||||
}
|
}
|
||||||
printbuf_exit(&buf);
|
printbuf_exit(&buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * bch2_fs_journal_keys_init - initialise the journal_keys state embedded in
 * a filesystem object.
 *
 * Sets the reference count to 1, records that this initial reference is
 * held, and initialises overwrite_lock.
 *
 * @c: filesystem whose c->journal_keys is initialised
 */
void bch2_fs_journal_keys_init(struct bch_fs *c)
|
||||||
|
{
|
||||||
|
struct journal_keys *keys = &c->journal_keys;
|
||||||
|
|
||||||
|
/* start with one reference, flagged as the initial one */
atomic_set(&keys->ref, 1);
|
||||||
|
keys->initial_ref_held = true;
|
||||||
|
mutex_init(&keys->overwrite_lock);
|
||||||
|
}
|
||||||
|
|
|
@ -97,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
|
||||||
|
|
||||||
void bch2_journal_keys_dump(struct bch_fs *);
|
void bch2_journal_keys_dump(struct bch_fs *);
|
||||||
|
|
||||||
|
void bch2_fs_journal_keys_init(struct bch_fs *);
|
||||||
|
|
||||||
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
|
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
|
||||||
|
|
36
fs/bcachefs/btree_journal_iter_types.h
Normal file
36
fs/bcachefs/btree_journal_iter_types.h
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
|
||||||
|
#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
|
||||||
|
|
||||||
|
/*
 * A run of contiguous overwritten journal keys.
 *
 * @start and @end are logical key indices rather than raw gap-buffer
 * positions: bch2_journal_iter_peek() converts @end with idx_to_pos() before
 * using it as a position.  The range is half-open, [start, end): readers
 * skipping forward jump to @end, readers skipping backward jump to
 * @start - 1.
 */
struct journal_key_range_overwritten {
|
||||||
|
size_t start, end;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * One entry in the journal_keys array.
 *
 * @btree_id, @level: compared against the requested btree and level when
 *	bch2_journal_key_overwritten() looks a key up.
 * @allocated: when set, @k is kfree()d by bch2_journal_keys_put().
 * @overwritten: set by __bch2_journal_key_overwritten(); the peek functions
 *	skip keys that have it set.
 * @overwritten_range: shared by every key in a tracked run of overwritten
 *	keys; NULL when the key is not part of such a run.  Readers access it
 *	with rcu_dereference() under rcu_read_lock().
 * @k: the key itself.
 *
 * @journal_seq and @journal_offset are not used in the code visible here;
 * presumably they record where in the journal the key came from - confirm.
 */
struct journal_key {
|
||||||
|
u64 journal_seq;
|
||||||
|
u32 journal_offset;
|
||||||
|
enum btree_id btree_id:8;
|
||||||
|
unsigned level:8;
|
||||||
|
bool allocated;
|
||||||
|
bool overwritten;
|
||||||
|
struct journal_key_range_overwritten __rcu *
|
||||||
|
overwritten_range;
|
||||||
|
struct bkey_i *k;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * The array of journal keys, stored as a gap buffer.
 *
 * @nr is the number of keys and @size the capacity of @data; the difference
 * between them is the size of the gap that starts at @gap.
 */
struct journal_keys {
|
||||||
|
/* must match layout in darray_types.h */
|
||||||
|
size_t nr, size;
|
||||||
|
struct journal_key *data;
|
||||||
|
/*
|
||||||
|
* Gap buffer: instead of all the empty space in the array being at the
|
||||||
|
* end of the buffer - from @nr to @size - the empty space is at @gap.
|
||||||
|
* This means that sequential insertions are O(n) instead of O(n^2).
|
||||||
|
*/
|
||||||
|
size_t gap;
|
||||||
|
/* set to 1 / true respectively by bch2_fs_journal_keys_init() */
atomic_t ref;
|
||||||
|
bool initial_ref_held;
|
||||||
|
/* held by bch2_journal_key_overwritten() around __bch2_journal_key_overwritten() */
struct mutex overwrite_lock;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
|
|
@ -773,8 +773,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||||
|
|
||||||
init_rwsem(&c->gc_lock);
|
init_rwsem(&c->gc_lock);
|
||||||
mutex_init(&c->gc_gens_lock);
|
mutex_init(&c->gc_gens_lock);
|
||||||
atomic_set(&c->journal_keys.ref, 1);
|
|
||||||
c->journal_keys.initial_ref_held = true;
|
|
||||||
|
|
||||||
for (i = 0; i < BCH_TIME_STAT_NR; i++)
|
for (i = 0; i < BCH_TIME_STAT_NR; i++)
|
||||||
bch2_time_stats_init(&c->times[i]);
|
bch2_time_stats_init(&c->times[i]);
|
||||||
|
@ -784,6 +782,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
||||||
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
|
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
|
||||||
bch2_fs_btree_iter_init_early(c);
|
bch2_fs_btree_iter_init_early(c);
|
||||||
bch2_fs_btree_interior_update_init_early(c);
|
bch2_fs_btree_interior_update_init_early(c);
|
||||||
|
bch2_fs_journal_keys_init(c);
|
||||||
bch2_fs_allocator_background_init(c);
|
bch2_fs_allocator_background_init(c);
|
||||||
bch2_fs_allocator_foreground_init(c);
|
bch2_fs_allocator_foreground_init(c);
|
||||||
bch2_fs_rebalance_init(c);
|
bch2_fs_rebalance_init(c);
|
||||||
|
|
Loading…
Reference in a new issue