1
0
Fork 0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2025-01-24 01:09:38 -05:00

bcachefs: Don't require flush/fua on every journal write

This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.

 - non flush/fua journal writes don't update last_seq (i.e. they don't
   free up space in the journal), thus the journal free space
   calculations now check whether nonflush journal writes are currently
   allowed (i.e. are we low on free space, or would doing a flush write
   free up a lot of space in the journal)

 - write_delay_ms, the user configurable option for when open journal
   entries are automatically written, is now interpreted as the max
   delay between flush journal writes (default 1 second).

 - bch2_journal_flush_seq_async is changed to ensure that a flush write
   with a sequence number >= the requested one has happened

 - journal read/replay must now ignore, and blacklist, any journal
   entries newer than the most recent flush entry in the journal. Also,
   the way the read_entire_journal option is handled has been improved;
   struct journal_replay now has a field, 'ignore', for entries that
   were read but should not be used.

 - assorted refactoring and improvements related to journal read in
   journal_io.c and recovery.c

Previously, we'd have to issue a flush/fua write every time we
accumulated a full journal entry - typically the bucket size. Now we
need to issue them much less frequently: when an fsync is requested, or
it's been more than write_delay_ms since the last flush, or when we need
to free up space in the journal. This is a significant performance
improvement on many write heavy workloads.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2020-11-14 09:59:58 -05:00 committed by Kent Overstreet
parent b6df4325cd
commit adbcada43f
9 changed files with 312 additions and 151 deletions

View file

@ -1336,14 +1336,16 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
x(new_varint, 15)
x(new_varint, 15) \
x(journal_no_flush, 16)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint))\
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@ -1582,6 +1584,7 @@ struct jset {
LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
#define BCH_JOURNAL_BUCKETS_MIN 8

View file

@ -79,6 +79,8 @@ static void bch2_journal_buf_init(struct journal *j)
struct journal_buf *buf = journal_cur_buf(j);
bkey_extent_init(&buf->key);
buf->noflush = false;
buf->must_flush = false;
memset(buf->has_inode, 0, sizeof(buf->has_inode));
@ -574,7 +576,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct journal_buf *buf;
int ret = 0;
if (seq <= j->seq_ondisk)
if (seq <= j->flushed_seq_ondisk)
return 1;
spin_lock(&j->lock);
@ -585,16 +587,53 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
goto out;
}
if (seq <= j->seq_ondisk) {
if (seq <= j->flushed_seq_ondisk) {
ret = 1;
goto out;
}
if (parent &&
(buf = journal_seq_to_buf(j, seq)))
if (!closure_wait(&buf->wait, parent))
/* if seq was written, but not flushed - flush a newer one instead */
seq = max(seq, last_unwritten_seq(j));
recheck_need_open:
if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
struct journal_res res = { 0 };
spin_unlock(&j->lock);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;
seq = res.seq;
buf = j->buf + (seq & JOURNAL_BUF_MASK);
buf->must_flush = true;
set_bit(JOURNAL_NEED_WRITE, &j->flags);
if (parent && !closure_wait(&buf->wait, parent))
BUG();
bch2_journal_res_put(j, &res);
spin_lock(&j->lock);
goto want_write;
}
/*
* if write was kicked off without a flush, flush the next sequence
* number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
seq++;
goto recheck_need_open;
}
buf->must_flush = true;
if (parent && !closure_wait(&buf->wait, parent))
BUG();
want_write:
if (seq == journal_cur_seq(j))
journal_entry_want_write(j);
out:
@ -979,6 +1018,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
j->last_flush_write = jiffies;
journal_pin_new_entry(j, 1);
@ -1116,6 +1156,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
"prereserved:\t\t%u/%u\n"
"nr flush writes:\t%llu\n"
"nr noflush writes:\t%llu\n"
"nr direct reclaim:\t%llu\n"
"nr background reclaim:\t%llu\n"
"current entry sectors:\t%u\n"
@ -1127,6 +1169,8 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->last_seq_ondisk,
j->prereserved.reserved,
j->prereserved.remaining,
j->nr_flush_writes,
j->nr_noflush_writes,
j->nr_direct_reclaim,
j->nr_background_reclaim,
j->cur_entry_sectors,

View file

@ -136,7 +136,7 @@ static inline u64 journal_last_seq(struct journal *j)
static inline u64 journal_cur_seq(struct journal *j)
{
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}

View file

@ -10,9 +10,26 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "trace.h"
/*
 * Unlink @i from the list it is on and hand it to kvpfree().
 *
 * The size passed to kvpfree() is the journal_replay header (everything
 * before the embedded jset, 'j') plus vstruct_bytes() of that jset.
 * Callers should treat @i as gone once this returns.
 */
static void __journal_replay_free(struct journal_replay *i)
{
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
}
/*
 * Mark @i as an entry that was read but should not be used.
 *
 * Unless the read_entire_journal option is set, the entry is also unlinked
 * and freed via __journal_replay_free(), so @i must not be touched
 * afterwards. With the option set, the entry stays on its list with
 * ->ignore set.
 */
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
i->ignore = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(i);
}
struct journal_list {
struct closure cl;
struct mutex lock;
@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct bch_devs_list devs = { .nr = 0 };
struct list_head *where;
size_t bytes = vstruct_bytes(j);
__le64 last_seq;
u64 last_seq = 0;
int ret;
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
: 0;
if (!c->opts.read_entire_journal) {
/* Is this entry older than the range we need? */
if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
goto out;
list_for_each_entry_reverse(i, jlist->head, list) {
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
break;
}
}
/* Drop entries we don't need anymore */
/* Is this entry older than the range we need? */
if (!c->opts.read_entire_journal &&
le64_to_cpu(j->seq) < last_seq) {
ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
goto out;
}
/* Drop entries we don't need anymore */
if (!JSET_NO_FLUSH(j)) {
list_for_each_entry_safe(i, pos, jlist->head, list) {
if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
break;
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
journal_replay_free(c, i);
}
}
@ -80,9 +98,7 @@ add:
if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
if (i->bad) {
devs = i->devs;
list_del(&i->list);
kvpfree(i, offsetof(struct journal_replay, j) +
vstruct_bytes(&i->j));
__journal_replay_free(i);
} else if (bad) {
goto found;
} else {
@ -104,6 +120,7 @@ add:
list_add(&i->list, where);
i->devs = devs;
i->bad = bad;
i->ignore = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
found:
if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@ -698,14 +715,16 @@ err:
goto out;
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
int bch2_journal_read(struct bch_fs *c, struct list_head *list,
u64 *blacklist_seq, u64 *start_seq)
{
struct journal_list jlist;
struct journal_replay *i;
struct journal_replay *i, *t;
struct bch_dev *ca;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
u64 seq, last_seq = 0;
int ret = 0;
closure_init_stack(&jlist.cl);
@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (jlist.ret)
return jlist.ret;
if (list_empty(list)) {
bch_info(c, "journal read done, but no entries found");
return 0;
}
i = list_last_entry(list, struct journal_replay, list);
*start_seq = le64_to_cpu(i->j.seq) + 1;
/*
* Find most recent flush entry, and ignore newer non flush entries -
* those entries will be blacklisted:
*/
list_for_each_entry_safe_reverse(i, t, list, list) {
if (i->ignore)
continue;
if (!JSET_NO_FLUSH(&i->j)) {
last_seq = le64_to_cpu(i->j.last_seq);
*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
break;
}
journal_replay_free(c, i);
}
if (!last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
return -1;
}
/* Drop blacklisted entries and entries older than last_seq: */
list_for_each_entry_safe(i, t, list, list) {
if (i->ignore)
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < last_seq) {
journal_replay_free(c, i);
continue;
}
if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
"found blacklisted journal entry %llu", seq);
journal_replay_free(c, i);
}
}
/* Check for missing entries: */
seq = last_seq;
list_for_each_entry(i, list, list) {
if (i->ignore)
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
while (seq < le64_to_cpu(i->j.seq)) {
u64 missing_start, missing_end;
while (seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
if (seq == le64_to_cpu(i->j.seq))
break;
missing_start = seq;
while (seq < le64_to_cpu(i->j.seq) &&
!bch2_journal_seq_is_blacklisted(c, seq, false))
seq++;
missing_end = seq - 1;
fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
missing_start, missing_end,
last_seq, *blacklist_seq - 1);
}
seq++;
}
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas;
char buf[80];
if (i->ignore)
continue;
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
entries++;
}
if (!list_empty(list)) {
i = list_last_entry(list, struct journal_replay, list);
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, *start_seq);
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, le64_to_cpu(i->j.seq));
}
if (*start_seq != *blacklist_seq)
bch_info(c, "dropped unflushed entries %llu-%llu",
*blacklist_seq, *start_seq - 1);
fsck_err:
return ret;
}
@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl)
j->seq_ondisk = seq;
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
j->last_seq_ondisk = last_seq;
bch2_journal_space_available(j);
if (!w->noflush) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = last_seq;
bch2_journal_space_available(j);
}
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl)
j->write_start_time = local_clock();
spin_lock(&j->lock);
if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
j->nr_noflush_writes++;
} else {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
}
spin_unlock(&j->lock);
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
@ -1183,11 +1307,12 @@ retry_alloc:
sectors);
bio = ca->journal.bio;
bio_reset(bio, ca->disk_sb.bdev,
REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
if (!JSET_NO_FLUSH(jset))
bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
bch2_bio_map(bio, jset, sectors << 9);
trace_journal_write(bio);
@ -1196,18 +1321,19 @@ retry_alloc:
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
!bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
if (!JSET_NO_FLUSH(jset)) {
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
!bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
no_io:
bch2_bucket_seq_cleanup(c);

View file

@ -11,6 +11,7 @@ struct journal_replay {
struct bch_devs_list devs;
/* checksum error, but we may want to try using it anyways: */
bool bad;
bool ignore;
/* must be last: */
struct jset j;
};
@ -37,7 +38,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
void bch2_journal_write(struct closure *);

View file

@ -158,7 +158,7 @@ void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned clean;
unsigned clean, clean_ondisk, total;
unsigned overhead, u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
@ -204,13 +204,21 @@ void bch2_journal_space_available(struct journal *j)
for (i = 0; i < journal_space_nr; i++)
j->space[i] = __journal_space_available(j, nr_devs_want, i);
clean_ondisk = j->space[journal_space_clean_ondisk].total;
clean = j->space[journal_space_clean].total;
total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
ret = cur_entry_journal_full;
else if (!fifo_free(&j->pin))
ret = cur_entry_journal_pin_full;
if ((clean - clean_ondisk <= total / 8) &&
(clean_ondisk * 2 > clean ))
set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
overhead = DIV_ROUND_UP(clean, max_entry_size) *
journal_entry_overhead(j);
u64s_remaining = clean << 6;

View file

@ -118,7 +118,7 @@ out_write_sb:
out:
mutex_unlock(&c->sb_lock);
return ret;
return ret ?: bch2_blacklist_table_initialize(c);
}
static int journal_seq_blacklist_table_cmp(const void *_l,
@ -164,8 +164,6 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
BUG_ON(c->journal_seq_blacklist_table);
if (!bl)
return 0;
@ -187,6 +185,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
journal_seq_blacklist_table_cmp,
NULL);
kfree(c->journal_seq_blacklist_table);
c->journal_seq_blacklist_table = t;
return 0;
}

View file

@ -29,6 +29,8 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
@ -146,6 +148,7 @@ enum {
JOURNAL_RECLAIM_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_MAY_GET_UNRESERVED,
JOURNAL_MAY_SKIP_FLUSH,
};
/* Embedded in struct bch_fs */
@ -203,6 +206,7 @@ struct journal {
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
@ -252,11 +256,15 @@ struct journal {
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
unsigned long last_flush_write;
u64 res_get_blocked_start;
u64 need_write_time;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
struct bch2_time_stats *write_time;
struct bch2_time_stats *delay_time;
struct bch2_time_stats *blocked_time;

View file

@ -313,7 +313,7 @@ void bch2_journal_keys_free(struct journal_keys *keys)
static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
{
struct journal_replay *p;
struct journal_replay *i;
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct journal_keys keys = { NULL };
@ -323,35 +323,35 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
if (list_empty(journal_entries))
return keys;
keys.journal_seq_base =
le64_to_cpu(list_last_entry(journal_entries,
struct journal_replay, list)->j.last_seq);
list_for_each_entry(p, journal_entries, list) {
if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
list_for_each_entry(i, journal_entries, list) {
if (i->ignore)
continue;
for_each_jset_key(k, _n, entry, &p->j)
if (!keys.journal_seq_base)
keys.journal_seq_base = le64_to_cpu(i->j.seq);
for_each_jset_key(k, _n, entry, &i->j)
nr_keys++;
}
keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
if (!keys.d)
goto err;
list_for_each_entry(p, journal_entries, list) {
if (le64_to_cpu(p->j.seq) < keys.journal_seq_base)
list_for_each_entry(i, journal_entries, list) {
if (i->ignore)
continue;
for_each_jset_key(k, _n, entry, &p->j)
BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
for_each_jset_key(k, _n, entry, &i->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
.journal_seq = le64_to_cpu(i->j.seq) -
keys.journal_seq_base,
.journal_offset = k->_data - p->j._data,
.journal_offset = k->_data - i->j._data,
};
}
@ -643,46 +643,6 @@ err:
return ret;
}
static bool journal_empty(struct list_head *journal)
{
return list_empty(journal) ||
journal_entry_empty(&list_last_entry(journal,
struct journal_replay, list)->j);
}
static int
verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
struct list_head *journal)
{
struct journal_replay *i =
list_last_entry(journal, struct journal_replay, list);
u64 start_seq = le64_to_cpu(i->j.last_seq);
u64 end_seq = le64_to_cpu(i->j.seq);
u64 seq = start_seq;
int ret = 0;
list_for_each_entry(i, journal, list) {
if (le64_to_cpu(i->j.seq) < start_seq)
continue;
fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
seq, le64_to_cpu(i->j.seq) - 1,
start_seq, end_seq);
seq = le64_to_cpu(i->j.seq);
fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
"found blacklisted journal entry %llu", seq);
do {
seq++;
} while (bch2_journal_seq_is_blacklisted(c, seq, false));
}
fsck_err:
return ret;
}
/* journal replay early: */
static int journal_replay_entry_early(struct bch_fs *c,
@ -767,6 +727,7 @@ static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct list_head *journal)
{
struct journal_replay *i;
struct jset_entry *entry;
int ret;
@ -782,18 +743,19 @@ static int journal_replay_early(struct bch_fs *c,
return ret;
}
} else {
struct journal_replay *i =
list_last_entry(journal, struct journal_replay, list);
list_for_each_entry(i, journal, list) {
if (i->ignore)
continue;
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
list_for_each_entry(i, journal, list)
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
}
}
bch2_fs_usage_initialize(c);
@ -842,9 +804,6 @@ static int verify_superblock_clean(struct bch_fs *c,
struct bch_sb_field_clean *clean = *cleanp;
int ret = 0;
if (!c->sb.clean || !j)
return 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
@ -971,7 +930,8 @@ int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL;
u64 journal_seq;
struct jset *last_journal_entry = NULL;
u64 blacklist_seq, journal_seq;
bool write_sb = false, need_write_alloc = false;
int ret;
@ -991,24 +951,38 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
struct jset *j;
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
goto err;
}
ret = bch2_journal_read(c, &c->journal_entries);
if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
struct journal_replay *i;
ret = bch2_journal_read(c, &c->journal_entries,
&blacklist_seq, &journal_seq);
if (ret)
goto err;
if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c,
list_for_each_entry_reverse(i, &c->journal_entries, list)
if (!i->ignore) {
last_journal_entry = &i->j;
break;
}
if (mustfix_fsck_err_on(c->sb.clean &&
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
if (!c->sb.clean && list_empty(&c->journal_entries)) {
bch_err(c, "no journal entries found");
ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
if (!last_journal_entry) {
fsck_err_on(!c->sb.clean, c, "no journal entries found");
goto use_clean;
}
c->journal_keys = journal_keys_sort(&c->journal_entries);
@ -1017,16 +991,21 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
j = &list_last_entry(&c->journal_entries,
struct journal_replay, list)->j;
ret = verify_superblock_clean(c, &clean, j);
if (ret)
if (c->sb.clean && last_journal_entry) {
ret = verify_superblock_clean(c, &clean,
last_journal_entry);
if (ret)
goto err;
}
} else {
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
journal_seq = le64_to_cpu(j->seq) + 1;
} else {
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
if (!c->sb.clean &&
@ -1045,30 +1024,23 @@ int bch2_fs_recovery(struct bch_fs *c)
if (ret)
goto err;
if (!c->sb.clean) {
/*
* After an unclean shutdown, skip the next few journal sequence
* numbers as they may have been referenced by btree writes that
* happened before their corresponding journal writes - those btree
* writes need to be ignored, by skipping and blacklisting the next few
* journal sequence numbers:
*/
if (!c->sb.clean)
journal_seq += 8;
if (blacklist_seq != journal_seq) {
ret = bch2_journal_seq_blacklist_add(c,
journal_seq,
journal_seq + 8);
blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
}
journal_seq += 8;
/*
* The superblock needs to be written before we do any btree
* node writes: it will be in the read_write() path
*/
}
ret = bch2_blacklist_table_initialize(c);
if (!list_empty(&c->journal_entries)) {
ret = verify_journal_entries_not_blacklisted_or_missing(c,
&c->journal_entries);
if (ret)
goto err;
}
ret = bch2_fs_journal_start(&c->journal, journal_seq,