1
0
Fork 0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-01-22 07:53:11 -05:00

bcachefs: fix O(n^2) issue with whiteouts in journal keys

The journal_keys array can't be substantially modified after we go RW,
because lookups need to be able to check it locklessly - thus we're
limited on what we can do when a key in the journal has been
overwritten.

This is a problem when there are many overwrites to skip over for peek()
operations. To fix this, add tracking of ranges of overwrites: we create
a range entry when there's more than one contiguous whiteout.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-11-17 02:23:24 -05:00
parent 854724d116
commit eae6c4a625
5 changed files with 179 additions and 41 deletions

View file

@ -205,6 +205,7 @@
#include <linux/zstd.h>
#include "bcachefs_format.h"
#include "btree_journal_iter_types.h"
#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
@ -658,28 +659,6 @@ struct journal_seq_blacklist_table {
} entries[];
};
struct journal_keys {
/* must match layout in darray_types.h */
size_t nr, size;
struct journal_key {
u64 journal_seq;
u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
bool allocated;
bool overwritten;
struct bkey_i *k;
} *data;
/*
* Gap buffer: instead of all the empty space in the array being at the
* end of the buffer - from @nr to @size - the empty space is at @gap.
* This means that sequential insertions are O(n) instead of O(n^2).
*/
size_t gap;
atomic_t ref;
bool initial_ref_held;
};
struct btree_trans_buf {
struct btree_trans *trans;
};

View file

@ -16,6 +16,17 @@
* operations for the regular btree iter code to use:
*/
static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
{
size_t gap_size = keys->size - keys->nr;
BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
if (pos >= keys->gap)
pos -= gap_size;
return pos;
}
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
size_t gap_size = keys->size - keys->nr;
@ -84,27 +95,37 @@ struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_
}
}
struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
break;
if (k->overwritten) {
(*idx)++;
if (k->overwritten_range)
*idx = rcu_dereference(k->overwritten_range)->end;
else
*idx += 1;
continue;
}
if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
ret = k->k;
break;
}
(*idx)++;
iters++;
if (iters == 10) {
*idx = 0;
rcu_read_unlock();
goto search;
}
}
return NULL;
rcu_read_unlock();
return ret;
}
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
@ -130,17 +151,25 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b
}
}
struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
return NULL;
break;
if (k->overwritten) {
--(*idx);
if (k->overwritten_range)
*idx = rcu_dereference(k->overwritten_range)->start - 1;
else
*idx -= 1;
continue;
}
if (__journal_key_cmp(btree_id, level, pos, k) >= 0)
return k->k;
if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
ret = k->k;
break;
}
--(*idx);
iters++;
@ -150,7 +179,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b
}
}
return NULL;
rcu_read_unlock();
return ret;
}
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
@ -163,6 +193,7 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
static void journal_iter_verify(struct journal_iter *iter)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct journal_keys *keys = iter->keys;
size_t gap_size = keys->size - keys->nr;
@ -175,6 +206,7 @@ static void journal_iter_verify(struct journal_iter *iter)
int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
BUG_ON(cmp > 0);
}
#endif
}
static void journal_iters_fix(struct bch_fs *c)
@ -335,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
bkey_deleted(&keys->data[idx].k->k));
}
static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
{
struct journal_key *k = keys->data + pos;
size_t idx = pos_to_idx(keys, pos);
k->overwritten = true;
struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;
bool prev_overwritten = prev && prev->overwritten;
bool next_overwritten = next && next->overwritten;
struct journal_key_range_overwritten *prev_range =
prev_overwritten ? prev->overwritten_range : NULL;
struct journal_key_range_overwritten *next_range =
next_overwritten ? next->overwritten_range : NULL;
BUG_ON(prev_range && prev_range->end != idx);
BUG_ON(next_range && next_range->start != idx + 1);
if (prev_range && next_range) {
prev_range->end = next_range->end;
keys->data[pos].overwritten_range = prev_range;
for (size_t i = next_range->start; i < next_range->end; i++) {
struct journal_key *ip = keys->data + idx_to_pos(keys, i);
BUG_ON(ip->overwritten_range != next_range);
ip->overwritten_range = prev_range;
}
kfree_rcu_mightsleep(next_range);
} else if (prev_range) {
prev_range->end++;
k->overwritten_range = prev_range;
if (next_overwritten) {
prev_range->end++;
next->overwritten_range = prev_range;
}
} else if (next_range) {
next_range->start--;
k->overwritten_range = next_range;
if (prev_overwritten) {
next_range->start--;
prev->overwritten_range = next_range;
}
} else if (prev_overwritten || next_overwritten) {
struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
if (!r)
return;
r->start = idx - (size_t) prev_overwritten;
r->end = idx + 1 + (size_t) next_overwritten;
rcu_assign_pointer(k->overwritten_range, r);
if (prev_overwritten)
prev->overwritten_range = r;
if (next_overwritten)
next->overwritten_range = r;
}
}
void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
@ -344,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
if (idx < keys->size &&
keys->data[idx].btree_id == btree &&
keys->data[idx].level == level &&
bpos_eq(keys->data[idx].k->k.p, pos))
keys->data[idx].overwritten = true;
bpos_eq(keys->data[idx].k->k.p, pos) &&
!keys->data[idx].overwritten) {
mutex_lock(&keys->overwrite_lock);
__bch2_journal_key_overwritten(keys, idx);
mutex_unlock(&keys->overwrite_lock);
}
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
@ -359,8 +457,11 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
struct bkey_s_c ret = bkey_s_c_null;
journal_iter_verify(iter);
rcu_read_lock();
while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;
@ -369,13 +470,19 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
break;
BUG_ON(cmp);
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
if (!k->overwritten) {
ret = bkey_i_to_s_c(k->k);
break;
}
bch2_journal_iter_advance(iter);
if (k->overwritten_range)
iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
else
bch2_journal_iter_advance(iter);
}
rcu_read_unlock();
return bkey_s_c_null;
return ret;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
@ -556,9 +663,15 @@ void bch2_journal_keys_put(struct bch_fs *c)
move_gap(keys, keys->nr);
darray_for_each(*keys, i)
darray_for_each(*keys, i) {
if (i->overwritten_range &&
(i == &darray_last(*keys) ||
i->overwritten_range != i[1].overwritten_range))
kfree(i->overwritten_range);
if (i->allocated)
kfree(i->k);
}
kvfree(keys->data);
keys->data = NULL;
@ -682,3 +795,12 @@ void bch2_journal_keys_dump(struct bch_fs *c)
}
printbuf_exit(&buf);
}
void bch2_fs_journal_keys_init(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
atomic_set(&keys->ref, 1);
keys->initial_ref_held = true;
mutex_init(&keys->overwrite_lock);
}

View file

@ -97,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
void bch2_journal_keys_dump(struct bch_fs *);
void bch2_fs_journal_keys_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */

View file

@ -0,0 +1,36 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
struct journal_key_range_overwritten {
size_t start, end;
};
struct journal_key {
u64 journal_seq;
u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
bool allocated;
bool overwritten;
struct journal_key_range_overwritten __rcu *
overwritten_range;
struct bkey_i *k;
};
struct journal_keys {
/* must match layout in darray_types.h */
size_t nr, size;
struct journal_key *data;
/*
* Gap buffer: instead of all the empty space in the array being at the
* end of the buffer - from @nr to @size - the empty space is at @gap.
* This means that sequential insertions are O(n) instead of O(n^2).
*/
size_t gap;
atomic_t ref;
bool initial_ref_held;
struct mutex overwrite_lock;
};
#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */

View file

@ -773,8 +773,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
atomic_set(&c->journal_keys.ref, 1);
c->journal_keys.initial_ref_held = true;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
@ -784,6 +782,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
bch2_fs_btree_iter_init_early(c);
bch2_fs_btree_interior_update_init_early(c);
bch2_fs_journal_keys_init(c);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
bch2_fs_rebalance_init(c);