bcachefs: Checksum errors get additional retries

It's possible for checksum errors to be transient - e.g. a flaky
controller or cable - thus we need additional retries (besides retrying
from different replicas) before we can definitively return an error.

This is particularly important for the next patch, which will allow the
data move path to move extents with checksum errors - we don't want to
accidentally introduce bitrot due to a transient error!

- bch2_bkey_pick_read_device() is substantially reworked, and
  bch2_dev_io_failures is expanded to record more information about the
  type of failure (i.e. number of checksum errors).

  It now returns an error code that describes more precisely the reason
  for the failure - checksum error, I/O error, or offline device - instead
  of the previous generic "insufficient devices". This is important for
  the next patches that add poisoning, as we only want to poison extents
  when we've seen real checksum errors (or perhaps I/O errors?) - not
  merely because a device was offline.

- Add a new option and superblock field for the number of checksum
  retries.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet
2025-03-08 12:56:43 -05:00
parent ccba9029b0
commit be31e412ac
9 changed files with 128 additions and 82 deletions
+2
View File
@@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
/* one free bit */
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
+1 -1
View File
@@ -1355,7 +1355,7 @@ start:
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
bch2_mark_io_failure(&failed, &rb->pick);
bch2_mark_io_failure(&failed, &rb->pick, false);
can_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
+3 -1
View File
@@ -273,7 +273,6 @@
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
x(EIO, extent_poisened) \
x(EIO, no_device_to_read_from) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
@@ -283,6 +282,9 @@
x(EIO, ec_block_read) \
x(EIO, ec_block_write) \
x(EIO, data_read) \
x(BCH_ERR_data_read, no_device_to_read_from) \
x(BCH_ERR_data_read, data_read_io_err) \
x(BCH_ERR_data_read, data_read_csum_err) \
x(BCH_ERR_data_read, data_read_retry) \
x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
+97 -68
View File
@@ -58,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
}
void bch2_mark_io_failure(struct bch_io_failures *failed,
struct extent_ptr_decoded *p)
struct extent_ptr_decoded *p,
bool csum_error)
{
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
@@ -66,17 +67,16 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
f->dev = p->ptr.dev;
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else if (p->idx != f->idx) {
f->idx = p->idx;
f->nr_failed = 1;
f->nr_retries = 0;
} else {
f->nr_failed++;
memset(f, 0, sizeof(*f));
f->dev = p->ptr.dev;
}
if (p->do_ec_reconstruct)
f->failed_ec = true;
else if (!csum_error)
f->failed_io = true;
else
f->failed_csum_nr++;
}
static inline u64 dev_latency(struct bch_dev *ca)
@@ -94,37 +94,30 @@ static inline int dev_failed(struct bch_dev *ca)
*/
static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
const struct extent_ptr_decoded p2)
u64 p1_latency,
struct bch_dev *ca1,
const struct extent_ptr_decoded p2,
u64 p2_latency)
{
if (likely(!p1.idx && !p2.idx)) {
struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
int failed_delta = dev_failed(ca1) - dev_failed(ca2);
if (unlikely(failed_delta))
return failed_delta < 0;
if (failed_delta)
return failed_delta < 0;
if (unlikely(bch2_force_reconstruct_read))
return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
u64 l1 = dev_latency(ca1);
u64 l2 = dev_latency(ca2);
if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
/*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
l1 *= l1;
l2 *= l2;
int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
if (unlikely(crc_retry_delta))
return crc_retry_delta < 0;
/* Pick at random, biased in favor of the faster device: */
/* Pick at random, biased in favor of the faster device: */
return bch2_get_random_u64_below(l1 + l2) > l1;
}
if (bch2_force_reconstruct_read)
return p1.idx > p2.idx;
return p1.idx < p2.idx;
return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
/*
@@ -133,73 +126,109 @@ static inline bool ptr_better(struct bch_fs *c,
* other devices, it will still pick a pointer from avoid.
*/
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_failures *failed,
struct extent_ptr_decoded *pick,
int dev)
struct bch_io_failures *failed,
struct extent_ptr_decoded *pick,
int dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
struct bch_dev_io_failures *f;
int ret = 0;
bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
return -BCH_ERR_key_type_error;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
return -BCH_ERR_extent_poisened;
rcu_read_lock();
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 pick_latency;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
have_dirty_ptrs |= !p.ptr.cached;
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
if (p.ptr.unwritten) {
ret = 0;
break;
rcu_read_unlock();
return 0;
}
/* Are we being asked to read from a specific device? */
if (dev >= 0 && p.ptr.dev != dev)
continue;
/*
* If there are any dirty pointers it's an error if we can't
* read:
*/
if (!ret && !p.ptr.cached)
ret = -BCH_ERR_no_device_to_read_from;
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (f)
p.idx = f->nr_failed < f->nr_retries
? f->idx
: f->idx + 1;
struct bch_dev_io_failures *f =
unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
if (unlikely(f)) {
p.crc_retry_nr = f->failed_csum_nr;
p.has_ec &= ~f->failed_ec;
if (!p.idx && (!ca || !bch2_dev_is_online(ca)))
p.idx++;
if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
have_io_errors |= f->failed_io;
have_io_errors |= f->failed_ec;
}
have_csum_errors |= !!f->failed_csum_nr;
if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
p.idx++;
if (p.has_ec && (f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true;
else if (f->failed_io ||
f->failed_csum_nr > c->opts.checksum_err_retry_nr)
continue;
}
if (p.idx > (unsigned) p.has_ec)
continue;
have_missing_devs |= ca && !bch2_dev_is_online(ca);
if (ret > 0 && !ptr_better(c, p, *pick))
continue;
if (!ca || !bch2_dev_is_online(ca)) {
if (!p.has_ec)
continue;
p.do_ec_reconstruct = true;
}
*pick = p;
ret = 1;
if (bch2_force_reconstruct_read && p.has_ec)
p.do_ec_reconstruct = true;
u64 p_latency = dev_latency(ca);
/*
* Square the latencies, to bias more in favor of the faster
* device - we never want to stop issuing reads to the slower
* device altogether, so that we can update our latency numbers:
*/
p_latency *= p_latency;
if (!have_pick ||
ptr_better(c,
p, p_latency, ca,
*pick, pick_latency)) {
*pick = p;
pick_latency = p_latency;
have_pick = true;
}
}
rcu_read_unlock();
return ret;
if (have_pick)
return 1;
if (!have_dirty_ptrs)
return 0;
if (have_missing_devs)
return -BCH_ERR_no_device_to_read_from;
if (have_csum_errors)
return -BCH_ERR_data_read_csum_err;
if (have_io_errors)
return -BCH_ERR_data_read_io_err;
WARN_ONCE(1, "unhandled error case in %s\n", __func__);
return -EINVAL;
}
/* KEY_TYPE_btree_ptr: */
+4 -3
View File
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
(_ptr).idx = 0; \
(_ptr).has_ec = false; \
(_ptr).has_ec = false; \
(_ptr).do_ec_reconstruct = false; \
(_ptr).crc_retry_nr = 0; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \
@@ -401,7 +402,7 @@ out: \
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *);
struct extent_ptr_decoded *, bool);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *, int);
+6 -5
View File
@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
unsigned idx;
bool has_ec;
bool do_ec_reconstruct;
u8 crc_retry_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
@@ -31,10 +32,10 @@ struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
u8 idx;
u8 nr_failed;
u8 nr_retries;
} devs[BCH_REPLICAS_MAX];
unsigned failed_csum_nr:6,
failed_io:1,
failed_ec:1;
} devs[BCH_REPLICAS_MAX + 1];
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
+6 -4
View File
@@ -487,7 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work)
bvec_iter_sectors(rbio->bvec_iter));
if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(&failed, &rbio->pick);
bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
if (!rbio->split) {
rbio->bio.bi_status = 0;
@@ -991,7 +992,7 @@ retry_pick:
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick);
bch2_mark_io_failure(failed, &pick, false);
percpu_ref_put(&ca->io_ref);
goto retry_pick;
}
@@ -1154,7 +1155,7 @@ retry_pick:
else
bch2_trans_unlock_long(trans);
if (!rbio->pick.idx) {
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@@ -1215,7 +1216,8 @@ out:
rbio = bch2_rbio_free(rbio);
if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(failed, &pick);
bch2_mark_io_failure(failed, &pick,
ret == -BCH_ERR_data_read_retry_csum_err);
return ret;
}
+5
View File
@@ -186,6 +186,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
x(checksum_err_retry_nr, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(0, 32), \
BCH_SB_CSUM_ERR_RETRY_NR, 3, \
NULL, NULL) \
x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
+4
View File
@@ -457,6 +457,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
!BCH_SB_CSUM_ERR_RETRY_NR(sb))
SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}
#ifdef __KERNEL__