bcachefs: Checksum errors get additional retries

It's possible for checksum errors to be transient - e.g. a flaky
controller or cable - thus we need additional retries (besides retrying
from different replicas) before we can definitively return an error.

This is particularly important for the next patch, which will allow the
data move path to move extents with checksum errors - we don't want to
accidentally introduce bitrot due to a transient error!

- bch2_bkey_pick_read_device() is substantially reworked, and
  bch2_dev_io_failures is expanded to record more information about the
  type of failure (i.e. number of checksum errors).

  It now returns an error code that describes more precisely the reason
  for the failure - checksum error, I/O error, or offline device - instead
  of the previous generic "insufficient devices". This is important for
  the next patches that add poisoning, as we only want to poison extents
  when we've seen real checksum errors (or perhaps I/O errors?) - not
  merely because a device was offline.

- Add a new option and superblock field for the number of checksum
  retries.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet
2025-03-08 12:56:43 -05:00
parent ccba9029b0
commit be31e412ac
9 changed files with 128 additions and 82 deletions
+2
View File
@@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
/* one free bit */
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
+1 -1
View File
@@ -1355,7 +1355,7 @@ start:
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
bch2_mark_io_failure(&failed, &rb->pick);
bch2_mark_io_failure(&failed, &rb->pick, false);
can_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
+3 -1
View File
@@ -273,7 +273,6 @@
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
x(EIO, extent_poisened) \
x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
@@ -283,6 +282,9 @@
x(EIO, ec_block_read) \
x(EIO, ec_block_write) \
x(EIO, data_read) \
x(BCH_ERR_data_read, no_device_to_read_from) \
x(BCH_ERR_data_read, data_read_io_err) \
x(BCH_ERR_data_read, data_read_csum_err) \
x(BCH_ERR_data_read, data_read_retry) \
x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
+97 -68
View File
@@ -58,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
}
void bch2_mark_io_failure(struct bch_io_failures *failed,
struct extent_ptr_decoded *p)
struct extent_ptr_decoded *p,
bool csum_error)
{
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
@@ -66,17 +67,16 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
f->dev = p->ptr.dev;
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else if (p->idx != f->idx) {
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else {
f->nr_failed++;
memset(f, 0, sizeof(*f));
f->dev = p->ptr.dev;
}
if (p->do_ec_reconstruct)
f->failed_ec = true;
else if (!csum_error)
f->failed_io = true;
else
f->failed_csum_nr++;
}
static inline u64 dev_latency(struct bch_dev *ca)
@@ -94,37 +94,30 @@ static inline int dev_failed(struct bch_dev *ca)
*/
static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
u64 p1_latency,
struct bch_dev *ca1,
const struct extent_ptr_decoded p2,
u64 p2_latency)
{
if (likely(!p1.idx && !p2.idx)) {
struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
if (unlikely(failed_delta))
return failed_delta < 0;
if (failed_delta)
return failed_delta < 0;
if (unlikely(bch2_force_reconstruct_read))
return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
u64 l1 = dev_latency(ca1);
u64 l2 = dev_latency(ca2);
if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
/*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
l1 *= l1;
l2 *= l2;
int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
if (unlikely(crc_retry_delta))
return crc_retry_delta < 0;
/* Pick at random, biased in favor of the faster device: */
/* Pick at random, biased in favor of the faster device: */
return bch2_get_random_u64_below(l1 + l2) > l1;
}
if (bch2_force_reconstruct_read)
return p1.idx > p2.idx;
return p1.idx < p2.idx;
return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
/*
@@ -133,73 +126,109 @@ static inline bool ptr_better(struct bch_fs *c,
* other devices, it will still pick a pointer from avoid.
*/
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_failures *failed,
struct extent_ptr_decoded *pick,
int dev)
struct bch_io_failures *failed,
struct extent_ptr_decoded *pick,
int dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
int ret = 0;
bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return -BCH_ERR_extent_poisened;
rcu_read_lock();
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 pick_latency;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
have_dirty_ptrs |= !p.ptr.cached;
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
if (p.ptr.unwritten) {
ret = 0;
break;
rcu_read_unlock();
return 0;
}
/* Are we being asked to read from a specific device? */
if (dev >= 0 && p.ptr.dev != dev)
continue;
/*
* If there are any dirty pointers it's an error if we can't
* read:
*/
if (!ret && !p.ptr.cached)
ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
: f->idx + 1;
struct bch_dev_io_failures *f =
unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (unlikely(f)) {
p.crc_retry_nr = f->failed_csum_nr;
p.has_ec &= ~f->failed_ec;
if (!p.idx && (!ca || !bch2_dev_is_online(ca)))
p.idx++;
if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
have_io_errors |= f->failed_io;
have_io_errors |= f->failed_ec;
}
have_csum_errors |= !!f->failed_csum_nr;
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
if (p.has_ec && (f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true;
else if (f->failed_io ||
f->failed_csum_nr > c->opts.checksum_err_retry_nr)
continue;
}
if (p.idx > (unsigned) p.has_ec)
continue;
have_missing_devs |= ca && !bch2_dev_is_online(ca);
if (ret > 0 && !ptr_better(c, p, *pick))
continue;
if (!ca || !bch2_dev_is_online(ca)) {
if (!p.has_ec)
continue;
p.do_ec_reconstruct = true;
}
*pick = p;
ret = 1;
if (bch2_force_reconstruct_read && p.has_ec)
p.do_ec_reconstruct = true;
u64 p_latency = dev_latency(ca);
/*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
p_latency *= p_latency;
if (!have_pick ||
ptr_better(c,
p, p_latency, ca,
*pick, pick_latency)) {
*pick = p;
pick_latency = p_latency;
have_pick = true;
}
}
rcu_read_unlock();
return ret;
if (have_pick)
return 1;
if (!have_dirty_ptrs)
return 0;
if (have_missing_devs)
return -BCH_ERR_no_device_to_read_from;
if (have_csum_errors)
return -BCH_ERR_data_read_csum_err;
if (have_io_errors)
return -BCH_ERR_data_read_io_err;
WARN_ONCE(1, "unhandled error case in %s\n", __func__);
return -EINVAL;
}
/* KEY_TYPE_btree_ptr: */
+4 -3
View File
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
(_ptr).idx = 0; \
(_ptr).has_ec = false; \
(_ptr).has_ec = false; \
(_ptr).do_ec_reconstruct = false; \
(_ptr).crc_retry_nr = 0; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \
@@ -401,7 +402,7 @@ out: \
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *);
struct extent_ptr_decoded *, bool);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *, int);
+6 -5
View File
@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
unsigned idx;
bool has_ec;
bool do_ec_reconstruct;
u8 crc_retry_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
@@ -31,10 +32,10 @@ struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
u8 idx;
u8 nr_failed;
u8 nr_retries;
} devs[BCH_REPLICAS_MAX];
unsigned failed_csum_nr:6,
failed_io:1,
failed_ec:1;
} devs[BCH_REPLICAS_MAX + 1];
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
+6 -4
View File
@@ -487,7 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work)
bvec_iter_sectors(rbio->bvec_iter));
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(&failed, &rbio->pick);
bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
if (!rbio->split) {
rbio->bio.bi_status = 0;
@@ -991,7 +992,7 @@ retry_pick:
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
bch2_mark_io_failure(failed, &pick, false);
percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
@@ -1154,7 +1155,7 @@ retry_pick:
else
bch2_trans_unlock_long(trans);
if (!rbio->pick.idx) {
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@@ -1215,7 +1216,8 @@ out:
rbio = bch2_rbio_free(rbio);
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(failed, &pick);
bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_retry_csum_err);
return ret;
}
+5
View File
@@ -186,6 +186,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(checksum_err_retry_nr, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(0, 32), \
BCH_SB_CSUM_ERR_RETRY_NR, 3, \
NULL, NULL) \
x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
+4
View File
@@ -457,6 +457,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
!BCH_SB_CSUM_ERR_RETRY_NR(sb))
SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}
#ifdef __KERNEL__