bcachefs: Internal reads can now correct errors

Rework the read path so that BCH_READ_NODECODE reads now also self-heal
after a read error and a successful retry - prerequisite for scrub.

- __bch2_read_endio() now handles a read that's both BCH_READ_NODECODE
  and a bounce.

  Normally, we don't want a BCH_READ_NODECODE read to ever allocate a
  split bch_read_bio: we want to maintain the relationship between the
  bch_read_bio and the data_update it's embedded in.

  But correcting read errors requires allocating a split/bounce rbio
  that's embedded in a promote_op. We do still have a 1-1 relationship,
  i.e. we only allocate a single split/bounce rbio if it's a
  BCH_READ_NODECODE read, so things hopefully don't get too crazy (see
  the sketch below).

- __bch2_read_extent() is now allowed to allocate the promote_op for
  rewriting after a failed read, even if it's BCH_READ_NODECODE.
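
To make the first point concrete, here is a condensed sketch of the new
__bch2_read_endio() flow, taken from the first hunk below and not
compilable on its own (note that BCH_READ_NODECODE appears as
BCH_READ_data_update in the code). Instead of jumping over the decode
path with a goto, the NODECODE case gets its own branch that hands the
pick we actually read back to the parent rbio and copies any bounced
data:

	if (likely(!(rbio->flags & BCH_READ_data_update))) {
		/* normal read: adjust crc, decrypt/decompress into dst as before */
	} else {
		/*
		 * NODECODE read: nothing to decode, but if a retry used a
		 * split/bounce rbio, pass the pick and the data back up:
		 */
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;
			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}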
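
And for the second point, a similarly condensed, non-compilable sketch
of the new __bch2_read_extent() shape (from the later hunks): the
NODECODE/data_update case no longer takes the early "goto get_bio"; it
just sets up a full read of the compressed extent and falls through, so
the promote_target/have_io_error(failed) check further down can
allocate the promote op that rewrites the bad copy:

	if (!(flags & BCH_READ_data_update)) {
		/* normal read: work out bounce/read_full as before */
	} else {
		/* NODECODE read: read the full compressed extent in place */
		read_full = true;
		iter.bi_size = pick.crc.compressed_size << 9;
	}
	/* falls through to the promote_target/have_io_error() check */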

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet
Date:   2024-12-30 16:32:57 -05:00
Commit: dff6de9518 (parent: 7b1d655106)
Diffstat: +56 -52
@@ -696,32 +696,40 @@ static void __bch2_read_endio(struct work_struct *work)
 	if (unlikely(rbio->narrow_crcs))
 		bch2_rbio_narrow_crcs(rbio);
 
-	if (rbio->flags & BCH_READ_data_update)
-		goto nodecode;
+	if (likely(!(rbio->flags & BCH_READ_data_update))) {
+		/* Adjust crc to point to subset of data we want: */
+		crc.offset += rbio->offset_into_extent;
+		crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
 
-	/* Adjust crc to point to subset of data we want: */
-	crc.offset += rbio->offset_into_extent;
-	crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
+		if (crc_is_compressed(crc)) {
+			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+			if (ret)
+				goto decrypt_err;
 
-	if (crc_is_compressed(crc)) {
-		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-		if (ret)
-			goto decrypt_err;
+			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+			    !c->opts.no_data_io)
+				goto decompression_err;
+		} else {
+			/* don't need to decrypt the entire bio: */
+			nonce = nonce_add(nonce, crc.offset << 9);
+			bio_advance(src, crc.offset << 9);
 
-		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
-		    !c->opts.no_data_io)
-			goto decompression_err;
+			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+			src->bi_iter.bi_size = dst_iter.bi_size;
+
+			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+			if (ret)
+				goto decrypt_err;
+
+			if (rbio->bounce) {
+				struct bvec_iter src_iter = src->bi_iter;
+				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+			}
+		}
 	} else {
-		/* don't need to decrypt the entire bio: */
-		nonce = nonce_add(nonce, crc.offset << 9);
-		bio_advance(src, crc.offset << 9);
-
-		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
-		src->bi_iter.bi_size = dst_iter.bi_size;
-
-		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-		if (ret)
-			goto decrypt_err;
+		if (rbio->split)
+			rbio->parent->pick = rbio->pick;
 
 		if (rbio->bounce) {
 			struct bvec_iter src_iter = src->bi_iter;
@@ -739,7 +747,7 @@ static void __bch2_read_endio(struct work_struct *work)
 		if (ret)
 			goto decrypt_err;
 	}
-nodecode:
+
 	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
 		rbio = bch2_rbio_free(rbio);
 		bch2_rbio_done(rbio);
@@ -931,13 +939,35 @@ retry_pick:
 		goto retry_pick;
 	}
 
-	if (flags & BCH_READ_data_update) {
-		struct data_update *u = container_of(orig, struct data_update, rbio);
+	if (!(flags & BCH_READ_data_update)) {
+		if (!(flags & BCH_READ_last_fragment) ||
+		    bio_flagged(&orig->bio, BIO_CHAIN))
+			flags |= BCH_READ_must_clone;
+
+		narrow_crcs = !(flags & BCH_READ_in_retry) &&
+			bch2_can_narrow_extent_crcs(k, pick.crc);
+
+		if (narrow_crcs && (flags & BCH_READ_user_mapped))
+			flags |= BCH_READ_must_bounce;
+
+		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+		if (crc_is_compressed(pick.crc) ||
+		    (pick.crc.csum_type != BCH_CSUM_none &&
+		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+		       (flags & BCH_READ_user_mapped)) ||
+		      (flags & BCH_READ_must_bounce)))) {
+			read_full = true;
+			bounce = true;
+		}
+	} else {
+		read_full = true;
 		/*
 		 * can happen if we retry, and the extent we were going to read
 		 * has been merged in the meantime:
 		 */
+		struct data_update *u = container_of(orig, struct data_update, rbio);
 		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
 			if (ca)
 				percpu_ref_put(&ca->io_ref);
@@ -945,29 +975,6 @@ retry_pick:
 		}
 
 		iter.bi_size = pick.crc.compressed_size << 9;
-		goto get_bio;
 	}
 
-	if (!(flags & BCH_READ_last_fragment) ||
-	    bio_flagged(&orig->bio, BIO_CHAIN))
-		flags |= BCH_READ_must_clone;
-
-	narrow_crcs = !(flags & BCH_READ_in_retry) &&
-		bch2_can_narrow_extent_crcs(k, pick.crc);
-
-	if (narrow_crcs && (flags & BCH_READ_user_mapped))
-		flags |= BCH_READ_must_bounce;
-
-	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
-	if (crc_is_compressed(pick.crc) ||
-	    (pick.crc.csum_type != BCH_CSUM_none &&
-	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
-	       (flags & BCH_READ_user_mapped)) ||
-	      (flags & BCH_READ_must_bounce)))) {
-		read_full = true;
-		bounce = true;
-	}
 
 	if (orig->opts.promote_target || have_io_error(failed))
@@ -991,7 +998,7 @@ retry_pick:
 		pick.crc.offset = 0;
 		pick.crc.live_size = bvec_iter_sectors(iter);
 	}
-get_bio:
+
 	if (rbio) {
 		/*
 		 * promote already allocated bounce rbio:
@@ -1055,9 +1062,6 @@ get_bio:
 	rbio->version = k.k->bversion;
 	INIT_WORK(&rbio->work, NULL);
 
-	if (flags & BCH_READ_data_update)
-		orig->pick = pick;
-
 	rbio->bio.bi_opf = orig->bio.bi_opf;
 	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
 	rbio->bio.bi_end_io = bch2_read_endio;