From 5db755fbb1a0de4a4cfd5d5edfaa19853b9c56e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:56 +0200 Subject: [PATCH 01/26] ubd: refactor the interrupt handler Instead of a separate handler function that leaves no work in the interrupt hanler itself, split out a per-request end I/O helper and clean up the coding style and variable naming while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-By: Anton Ivanov Link: https://lore.kernel.org/r/20240531074837.1648501-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 49 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index ef805eaa9e01..0c9542d58c01 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -447,43 +447,30 @@ static int bulk_req_safe_read( return n; } -/* Called without dev->lock held, and only in interrupt context. */ -static void ubd_handler(void) +static void ubd_end_request(struct io_thread_req *io_req) { - int n; - int count; - - while(1){ - n = bulk_req_safe_read( - thread_fd, - irq_req_buffer, - &irq_remainder, - &irq_remainder_size, - UBD_REQ_BUFFER_SIZE - ); - if (n < 0) { - if(n == -EAGAIN) - break; - printk(KERN_ERR "spurious interrupt in ubd_handler, " - "err = %d\n", -n); - return; - } - for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { - struct io_thread_req *io_req = (*irq_req_buffer)[count]; - - if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) { - blk_queue_max_discard_sectors(io_req->req->q, 0); - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); - } - blk_mq_end_request(io_req->req, io_req->error); - kfree(io_req); - } + if (io_req->error == BLK_STS_NOTSUPP && + req_op(io_req->req) == REQ_OP_DISCARD) { + blk_queue_max_discard_sectors(io_req->req->q, 0); + blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); } + blk_mq_end_request(io_req->req, io_req->error); + kfree(io_req); } static irqreturn_t ubd_intr(int irq, void *dev) { - ubd_handler(); + int len, i; + + while ((len = bulk_req_safe_read(thread_fd, irq_req_buffer, + &irq_remainder, &irq_remainder_size, + UBD_REQ_BUFFER_SIZE)) >= 0) { + for (i = 0; i < len / sizeof(struct io_thread_req *); i++) + ubd_end_request((*irq_req_buffer)[i]); + } + + if (len < 0 && len != -EAGAIN) + pr_err("spurious interrupt in %s, err = %d\n", __func__, len); return IRQ_HANDLED; } From 31ade7d4fdcf382beb8cb229a1f5d77e0f239672 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:57 +0200 Subject: [PATCH 02/26] ubd: untagle discard vs write zeroes not support handling Discard and Write Zeroes are different operation and implemented by different fallocate opcodes for ubd. If one fails the other one can work and vice versa. Split the code to disable the operations in ubd_handler to only disable the operation that actually failed. Fixes: 50109b5a03b4 ("um: Add support for DISCARD in the UBD Driver") Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Acked-By: Anton Ivanov Link: https://lore.kernel.org/r/20240531074837.1648501-3-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 0c9542d58c01..093c87879d08 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -449,10 +449,11 @@ static int bulk_req_safe_read( static void ubd_end_request(struct io_thread_req *io_req) { - if (io_req->error == BLK_STS_NOTSUPP && - req_op(io_req->req) == REQ_OP_DISCARD) { - blk_queue_max_discard_sectors(io_req->req->q, 0); - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); + if (io_req->error == BLK_STS_NOTSUPP) { + if (req_op(io_req->req) == REQ_OP_DISCARD) + blk_queue_max_discard_sectors(io_req->req->q, 0); + else if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES) + blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); } blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); From a00d4bfce7c6d7fa4712b8133ec195c9bd142ae6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:58 +0200 Subject: [PATCH 03/26] rbd: increase io_opt again Commit 16d80c54ad42 ("rbd: set io_min, io_opt and discard_granularity to alloc_size") lowered the io_opt size for rbd from objset_bytes which is 4MB for typical setup to alloc_size which is typically 64KB. The commit mostly talks about discard behavior and does mention io_min in passing. Reducing io_opt means reducing the readahead size, which seems counter-intuitive given that rbd currently abuses the user max_sectors setting to actually increase the I/O size. Switch back to the old setting to allow larger reads (the readahead size despite it's name actually limits the size of any buffered read) and to prepare for using io_opt in the max_sectors calculation and getting drivers out of the business of overriding the max_user_sectors value. Signed-off-by: Christoph Hellwig Acked-by: Ilya Dryomov Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 26ff5cd2bf0a..46dc487ccc17 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4955,8 +4955,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) struct queue_limits lim = { .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, .max_user_sectors = objset_bytes >> SECTOR_SHIFT, + .io_opt = objset_bytes, .io_min = rbd_dev->opts->alloc_size, - .io_opt = rbd_dev->opts->alloc_size, .max_segments = USHRT_MAX, .max_segment_size = UINT_MAX, }; From a23634644afc2f7c1bac98776440a1f3b161819e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:47:59 +0200 Subject: [PATCH 04/26] block: take io_opt and io_min into account for max_sectors The soft max_sectors limit is normally capped by the hardware limits and an arbitrary upper limit enforced by the kernel, but can be modified by the user. A few drivers want to increase this limit (nbd, rbd) or adjust it up or down based on hardware capabilities (sd). Change blk_validate_limits to default max_sectors to the optimal I/O size, or upgrade it to the preferred minimal I/O size if that is larger than the kernel default if no optimal I/O size is provided based on the logic in the SD driver. This keeps the existing kernel default for drivers that do not provide an io_opt or very big io_min value, but picks a much more useful default for those who provide these hints, and allows to remove the hacks to set the user max_sectors limit in nbd, rbd and sd. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Acked-by: Ilya Dryomov Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++++ drivers/block/nbd.c | 2 +- drivers/block/rbd.c | 1 - drivers/scsi/sd.c | 29 +++++------------------------ 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index effeb9a639bb..a49abdb35548 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -153,6 +153,13 @@ static int blk_validate_limits(struct queue_limits *lim) if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else if (lim->io_opt) { + lim->max_sectors = + min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT); + } else if (lim->io_min && + lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { + lim->max_sectors = + min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT); } else { lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 22a79a62cc4e..ad887d614d5b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1808,7 +1808,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { struct queue_limits lim = { .max_hw_sectors = 65536, - .max_user_sectors = 256, + .io_opt = 256 << SECTOR_SHIFT, .max_segments = USHRT_MAX, .max_segment_size = UINT_MAX, }; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 46dc487ccc17..22ad704f81d8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4954,7 +4954,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; struct queue_limits lim = { .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, - .max_user_sectors = objset_bytes >> SECTOR_SHIFT, .io_opt = objset_bytes, .io_min = rbd_dev->opts->alloc_size, .max_segments = USHRT_MAX, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f6c822c9cbd2..3dff9150ce11 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3593,7 +3593,7 @@ static int sd_revalidate_disk(struct gendisk *disk) struct request_queue *q = sdkp->disk->queue; sector_t old_capacity = sdkp->capacity; unsigned char *buffer; - unsigned int dev_max, rw_max; + unsigned int dev_max; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -3675,34 +3675,15 @@ static int sd_revalidate_disk(struct gendisk *disk) else blk_queue_io_min(sdkp->disk->queue, 0); - if (sd_validate_opt_xfer_size(sdkp, dev_max)) { - q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); - rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks); - } else { - q->limits.io_opt = 0; - rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), - (sector_t)BLK_DEF_MAX_SECTORS_CAP); - } - /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ - rw_max = min_not_zero(rw_max, sdp->host->opt_sectors); - - /* Do not exceed controller limit */ - rw_max = min(rw_max, queue_max_hw_sectors(q)); - - /* - * Only update max_sectors if previously unset or if the current value - * exceeds the capabilities of the hardware. - */ - if (sdkp->first_scan || - q->limits.max_sectors > q->limits.max_dev_sectors || - q->limits.max_sectors > q->limits.max_hw_sectors) { - q->limits.max_sectors = rw_max; - q->limits.max_user_sectors = rw_max; + q->limits.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; + if (sd_validate_opt_xfer_size(sdkp, dev_max)) { + q->limits.io_opt = min_not_zero(q->limits.io_opt, + logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; From b3491b0db165c0cbe25874da66d97652c03db654 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:00 +0200 Subject: [PATCH 05/26] sd: simplify the ZBC case in provisioning_mode_store Don't reset the discard settings to no-op over and over when a user writes to the provisioning attribute as that is already the default mode for ZBC devices. In hindsight we should have made writing to the attribute fail for ZBC devices, but the code has probably been around for far too long to change this now. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3dff9150ce11..83aa17fea39d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -461,14 +461,13 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (sd_is_zoned(sdkp)) { - sd_config_discard(sdkp, SD_LBP_DISABLE); - return count; - } - if (sdp->type != TYPE_DISK) return -EINVAL; + /* ignore the provisioning mode for ZBC devices */ + if (sd_is_zoned(sdkp)) + return count; + mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; From b0dadb86a90bd5a7b723f9d3a6cf69da9b596496 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:01 +0200 Subject: [PATCH 06/26] sd: add a sd_disable_discard helper Add helper to disable discard when it is not supported and use it instead of sd_config_discard in the I/O completion handler. This avoids touching more fields than required in the I/O completion handler and prepares for converting sd to use the atomic queue limits API. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 83aa17fea39d..f07d90474e68 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -821,6 +821,12 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, return protect; } +static void sd_disable_discard(struct scsi_disk *sdkp) +{ + sdkp->provisioning_mode = SD_LBP_DISABLE; + blk_queue_max_discard_sectors(sdkp->disk->queue, 0); +} + static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) { struct request_queue *q = sdkp->disk->queue; @@ -2245,12 +2251,12 @@ static int sd_done(struct scsi_cmnd *SCpnt) case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: - sd_config_discard(sdkp, SD_LBP_DISABLE); + sd_disable_discard(sdkp); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ - sd_config_discard(sdkp, SD_LBP_DISABLE); + sd_disable_discard(sdkp); } else { sdkp->device->no_write_same = 1; sd_config_write_same(sdkp); From 9972b8ce0d4ba373901bdd1e15e4de58fcd7f662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:02 +0200 Subject: [PATCH 07/26] sd: add a sd_disable_write_same helper Add helper to disable WRITE SAME when it is not supported and use it instead of sd_config_write_same in the I/O completion handler. This avoids touching more fields than required in the I/O completion handler and prepares for converting sd to use the atomic queue limits API. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f07d90474e68..70211d0b1876 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1004,6 +1004,13 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) return sd_setup_write_same10_cmnd(cmd, false); } +static void sd_disable_write_same(struct scsi_disk *sdkp) +{ + sdkp->device->no_write_same = 1; + sdkp->max_ws_blocks = 0; + blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0); +} + static void sd_config_write_same(struct scsi_disk *sdkp) { struct request_queue *q = sdkp->disk->queue; @@ -2258,8 +2265,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_disable_discard(sdkp); } else { - sdkp->device->no_write_same = 1; - sd_config_write_same(sdkp); + sd_disable_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; From d15b9bd42cd3b2077812d4bf32f532a9bd5c4914 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:03 +0200 Subject: [PATCH 08/26] sd: simplify the disable case in sd_config_discard Fall through to the main call to blk_queue_max_discard_sectors given that max_blocks has been initialized to zero above instead of duplicating the call. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 70211d0b1876..0dbc6eb7a7ca 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -844,8 +844,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) case SD_LBP_FULL: case SD_LBP_DISABLE: - blk_queue_max_discard_sectors(q, 0); - return; + break; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, From f1e8185fc12c699c3abf4f39b1ff5d7793da3a95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:04 +0200 Subject: [PATCH 09/26] sd: factor out a sd_discard_mode helper Split the logic to pick the right discard mode into a little helper to prepare for further changes. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 0dbc6eb7a7ca..39eddfac09ef 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3201,6 +3201,25 @@ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) return; } +static unsigned int sd_discard_mode(struct scsi_disk *sdkp) +{ + if (!sdkp->lbpvpd) { + /* LBP VPD page not provided */ + if (sdkp->max_unmap_blocks) + return SD_LBP_UNMAP; + return SD_LBP_WS16; + } + + /* LBP VPD page tells us what to use */ + if (sdkp->lbpu && sdkp->max_unmap_blocks) + return SD_LBP_UNMAP; + if (sdkp->lbpws) + return SD_LBP_WS16; + if (sdkp->lbpws10) + return SD_LBP_WS10; + return SD_LBP_DISABLE; +} + /** * sd_read_block_limits - Query disk device for preferred I/O sizes. * @sdkp: disk to query @@ -3239,23 +3258,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); - if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ - - if (sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else - sd_config_discard(sdkp, SD_LBP_WS16); - - } else { /* LBP VPD page tells us what to use */ - if (sdkp->lbpu && sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else if (sdkp->lbpws) - sd_config_discard(sdkp, SD_LBP_WS16); - else if (sdkp->lbpws10) - sd_config_discard(sdkp, SD_LBP_WS10); - else - sd_config_discard(sdkp, SD_LBP_DISABLE); - } + sd_config_discard(sdkp, sd_discard_mode(sdkp)); } out: From 9c1d339a1bf45f4d3a2e77bbf24b0ec51f02551c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:05 +0200 Subject: [PATCH 10/26] sd: cleanup zoned queue limits initialization Consolidate setting zone-related queue limits in sd_zbc_read_zones instead of splitting them between sd_zbc_revalidate_zones and sd_zbc_read_zones, and move the early_zone_information initialization in sd_zbc_read_zones above setting up the queue limits. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd_zbc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 806036e48abe..1c24c844e8d1 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -565,12 +565,6 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) sdkp->zone_info.zone_blocks = zone_blocks; sdkp->zone_info.nr_zones = nr_zones; - blk_queue_chunk_sectors(q, - logical_to_sectors(sdkp->device, zone_blocks)); - - /* Enable block layer zone append emulation */ - blk_queue_max_zone_append_sectors(q, 0); - flags = memalloc_noio_save(); ret = blk_revalidate_disk_zones(disk); memalloc_noio_restore(flags); @@ -625,6 +619,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) if (ret != 0) goto err; + nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); + sdkp->early_zone_info.nr_zones = nr_zones; + sdkp->early_zone_info.zone_blocks = zone_blocks; + /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); if (sdkp->zones_max_open == U32_MAX) @@ -632,10 +630,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) else disk_set_max_open_zones(disk, sdkp->zones_max_open); disk_set_max_active_zones(disk, 0); - nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); - - sdkp->early_zone_info.nr_zones = nr_zones; - sdkp->early_zone_info.zone_blocks = zone_blocks; + blk_queue_chunk_sectors(q, + logical_to_sectors(sdkp->device, zone_blocks)); + /* Enable block layer zone append emulation */ + blk_queue_max_zone_append_sectors(q, 0); return 0; From 804e498e0496d889090f32f812b5ce1a4f2aa63e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:06 +0200 Subject: [PATCH 11/26] sd: convert to the atomic queue limits API Assign all queue limits through a local queue_limits variable and queue_limits_commit_update so that we can't race updating them from multiple places, and freeze the queue when updating them so that in-progress I/O submissions don't see half-updated limits. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 130 ++++++++++++++++++++++++------------------ drivers/scsi/sd.h | 6 +- drivers/scsi/sd_zbc.c | 15 ++--- 3 files changed, 85 insertions(+), 66 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 39eddfac09ef..049071b56819 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -101,12 +101,13 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 -static void sd_config_discard(struct scsi_disk *, unsigned int); -static void sd_config_write_same(struct scsi_disk *); +static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned int mode); +static void sd_config_write_same(struct scsi_disk *sdkp, + struct queue_limits *lim); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); -static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); @@ -456,7 +457,8 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; - int mode; + struct queue_limits lim; + int mode, err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -472,8 +474,13 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, if (mode < 0) return -EINVAL; - sd_config_discard(sdkp, mode); - + lim = queue_limits_start_update(sdkp->disk->queue); + sd_config_discard(sdkp, &lim, mode); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; return count; } static DEVICE_ATTR_RW(provisioning_mode); @@ -556,6 +563,7 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; + struct queue_limits lim; unsigned long max; int err; @@ -577,8 +585,13 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, sdkp->max_ws_blocks = max; } - sd_config_write_same(sdkp); - + lim = queue_limits_start_update(sdkp->disk->queue); + sd_config_write_same(sdkp, &lim); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; return count; } static DEVICE_ATTR_RW(max_write_same_blocks); @@ -827,17 +840,15 @@ static void sd_disable_discard(struct scsi_disk *sdkp) blk_queue_max_discard_sectors(sdkp->disk->queue, 0); } -static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) +static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned int mode) { - struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; - q->limits.discard_alignment = - sdkp->unmap_alignment * logical_block_size; - q->limits.discard_granularity = - max(sdkp->physical_block_size, - sdkp->unmap_granularity * logical_block_size); + lim->discard_alignment = sdkp->unmap_alignment * logical_block_size; + lim->discard_granularity = max(sdkp->physical_block_size, + sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { @@ -875,7 +886,8 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) break; } - blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); + lim->max_hw_discard_sectors = max_blocks * + (logical_block_size >> SECTOR_SHIFT); } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) @@ -1010,9 +1022,9 @@ static void sd_disable_write_same(struct scsi_disk *sdkp) blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0); } -static void sd_config_write_same(struct scsi_disk *sdkp) +static void sd_config_write_same(struct scsi_disk *sdkp, + struct queue_limits *lim) { - struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { @@ -1066,8 +1078,8 @@ static void sd_config_write_same(struct scsi_disk *sdkp) } out: - blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks * - (logical_block_size >> 9)); + lim->max_write_zeroes_sectors = + sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT); } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) @@ -2523,7 +2535,7 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, - unsigned char *buffer) + struct queue_limits *lim, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; @@ -2597,7 +2609,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; - blk_queue_alignment_offset(sdp->request_queue, alignment); + lim->alignment_offset = alignment; if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); @@ -2608,7 +2620,7 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; - sd_config_discard(sdkp, SD_LBP_WS16); + sd_config_discard(sdkp, lim, SD_LBP_WS16); } sdkp->capacity = lba + 1; @@ -2711,13 +2723,14 @@ static int sd_try_rc16_first(struct scsi_device *sdp) * read disk capacity */ static void -sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) +sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, + unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { - sector_size = read_capacity_16(sdkp, sdp, buffer); + sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size == -ENODEV) @@ -2737,7 +2750,7 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. " "Trying to use READ CAPACITY(16).\n"); - sector_size = read_capacity_16(sdkp, sdp, buffer); + sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); @@ -2796,9 +2809,8 @@ got_data: */ sector_size = 512; } - blk_queue_logical_block_size(sdp->request_queue, sector_size); - blk_queue_physical_block_size(sdp->request_queue, - sdkp->physical_block_size); + lim->logical_block_size = sector_size; + lim->physical_block_size = sdkp->physical_block_size; sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) @@ -3220,11 +3232,11 @@ static unsigned int sd_discard_mode(struct scsi_disk *sdkp) return SD_LBP_DISABLE; } -/** - * sd_read_block_limits - Query disk device for preferred I/O sizes. - * @sdkp: disk to query +/* + * Query disk device for preferred I/O sizes. */ -static void sd_read_block_limits(struct scsi_disk *sdkp) +static void sd_read_block_limits(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct scsi_vpd *vpd; @@ -3258,7 +3270,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); - sd_config_discard(sdkp, sd_discard_mode(sdkp)); + sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); } out: @@ -3277,11 +3289,9 @@ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) rcu_read_unlock(); } -/** - * sd_read_block_characteristics - Query block dev. characteristics - * @sdkp: disk to query - */ -static void sd_read_block_characteristics(struct scsi_disk *sdkp) +/* Query block device characteristics */ +static void sd_read_block_characteristics(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct request_queue *q = sdkp->disk->queue; struct scsi_vpd *vpd; @@ -3307,29 +3317,26 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */ if (sdkp->device->type == TYPE_ZBC) { - /* - * Host-managed. - */ - disk_set_zoned(sdkp->disk); + lim->zoned = true; /* * Per ZBC and ZAC specifications, writes in sequential write * required zones of host-managed devices must be aligned to * the device physical block size. */ - blk_queue_zone_write_granularity(q, sdkp->physical_block_size); + lim->zone_write_granularity = sdkp->physical_block_size; } else { /* * Host-aware devices are treated as conventional. */ - WARN_ON_ONCE(blk_queue_is_zoned(q)); + lim->zoned = false; } #endif /* CONFIG_BLK_DEV_ZONED */ if (!sdkp->first_scan) return; - if (blk_queue_is_zoned(q)) + if (lim->zoned) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); @@ -3605,8 +3612,10 @@ static int sd_revalidate_disk(struct gendisk *disk) struct scsi_device *sdp = sdkp->device; struct request_queue *q = sdkp->disk->queue; sector_t old_capacity = sdkp->capacity; + struct queue_limits lim; unsigned char *buffer; unsigned int dev_max; + int err; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -3627,12 +3636,14 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_spinup_disk(sdkp); + lim = queue_limits_start_update(sdkp->disk->queue); + /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { - sd_read_capacity(sdkp, buffer); + sd_read_capacity(sdkp, &lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation @@ -3651,10 +3662,10 @@ static int sd_revalidate_disk(struct gendisk *disk) if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); - sd_read_block_limits(sdkp); + sd_read_block_limits(sdkp, &lim); sd_read_block_limits_ext(sdkp); - sd_read_block_characteristics(sdkp); - sd_zbc_read_zones(sdkp, buffer); + sd_read_block_characteristics(sdkp, &lim); + sd_zbc_read_zones(sdkp, &lim, buffer); sd_read_cpr(sdkp); } @@ -3680,31 +3691,36 @@ static int sd_revalidate_disk(struct gendisk *disk) /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); - q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); + lim.max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) - blk_queue_io_min(sdkp->disk->queue, - logical_to_bytes(sdp, sdkp->min_xfer_blocks)); + lim.io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else - blk_queue_io_min(sdkp->disk->queue, 0); + lim.io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ - q->limits.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; + lim.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { - q->limits.io_opt = min_not_zero(q->limits.io_opt, + lim.io_opt = min_not_zero(lim.io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); - sd_config_write_same(sdkp); + sd_config_write_same(sdkp, &lim); kfree(buffer); + blk_mq_freeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update(sdkp->disk->queue, &lim); + blk_mq_unfreeze_queue(sdkp->disk->queue); + if (err) + return err; + /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. So if this fails, set back the gendisk diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 49dd600bfa48..b4170b17bad4 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -239,7 +239,8 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED -int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); +int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, + u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); @@ -250,7 +251,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, #else /* CONFIG_BLK_DEV_ZONED */ -static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) +static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, + struct queue_limits *lim, u8 buf[SD_BUF_SIZE]) { return 0; } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 1c24c844e8d1..f685838d9ed2 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -582,13 +582,15 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) /** * sd_zbc_read_zones - Read zone information and update the request queue * @sdkp: SCSI disk pointer. + * @lim: queue limits to read into * @buf: 512 byte buffer used for storing SCSI command output. * * Read zone information and update the request queue zone characteristics and * also the zoned device information in *sdkp. Called by sd_revalidate_disk() * before the gendisk capacity has been set. */ -int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) +int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, + u8 buf[SD_BUF_SIZE]) { struct gendisk *disk = sdkp->disk; struct request_queue *q = disk->queue; @@ -626,14 +628,13 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); if (sdkp->zones_max_open == U32_MAX) - disk_set_max_open_zones(disk, 0); + lim->max_open_zones = 0; else - disk_set_max_open_zones(disk, sdkp->zones_max_open); - disk_set_max_active_zones(disk, 0); - blk_queue_chunk_sectors(q, - logical_to_sectors(sdkp->device, zone_blocks)); + lim->max_open_zones = sdkp->zones_max_open; + lim->max_active_zones = 0; + lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks); /* Enable block layer zone append emulation */ - blk_queue_max_zone_append_sectors(q, 0); + lim->max_zone_append_sectors = 0; return 0; From 969f17e10f5b732c05186ee0126c8a08166d0cda Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:07 +0200 Subject: [PATCH 12/26] sr: convert to the atomic queue limits API Assign all queue limits through a local queue_limits variable and queue_limits_commit_update so that we can't race updating them from multiple places, and free the queue when updating them so that in-progress I/O submissions don't see half-updated limits. Also use the chance to clean up variable names to standard ones. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sr.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 7ab000942b97..3f491019103e 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -111,7 +111,7 @@ static struct lock_class_key sr_bio_compl_lkclass; static int sr_open(struct cdrom_device_info *, int); static void sr_release(struct cdrom_device_info *); -static void get_sectorsize(struct scsi_cd *); +static int get_sectorsize(struct scsi_cd *); static int get_capabilities(struct scsi_cd *); static unsigned int sr_check_events(struct cdrom_device_info *cdi, @@ -473,15 +473,15 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) return BLK_STS_IOERR; } -static void sr_revalidate_disk(struct scsi_cd *cd) +static int sr_revalidate_disk(struct scsi_cd *cd) { struct scsi_sense_hdr sshdr; /* if the unit is not ready, nothing more to do */ if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr)) - return; + return 0; sr_cd_check(&cd->cdi); - get_sectorsize(cd); + return get_sectorsize(cd); } static int sr_block_open(struct gendisk *disk, blk_mode_t mode) @@ -494,13 +494,16 @@ static int sr_block_open(struct gendisk *disk, blk_mode_t mode) return -ENXIO; scsi_autopm_get_device(sdev); - if (disk_check_media_change(disk)) - sr_revalidate_disk(cd); + if (disk_check_media_change(disk)) { + ret = sr_revalidate_disk(cd); + if (ret) + goto out; + } mutex_lock(&cd->lock); ret = cdrom_open(&cd->cdi, mode); mutex_unlock(&cd->lock); - +out: scsi_autopm_put_device(sdev); if (ret) scsi_device_put(cd->device); @@ -685,7 +688,9 @@ static int sr_probe(struct device *dev) blk_pm_runtime_init(sdev->request_queue, dev); dev_set_drvdata(dev, cd); - sr_revalidate_disk(cd); + error = sr_revalidate_disk(cd); + if (error) + goto unregister_cdrom; error = device_add_disk(&sdev->sdev_gendev, disk, NULL); if (error) @@ -714,13 +719,14 @@ fail: } -static void get_sectorsize(struct scsi_cd *cd) +static int get_sectorsize(struct scsi_cd *cd) { + struct request_queue *q = cd->device->request_queue; static const u8 cmd[10] = { READ_CAPACITY }; unsigned char buffer[8] = { }; - int the_result; + struct queue_limits lim; + int err; int sector_size; - struct request_queue *queue; struct scsi_failure failure_defs[] = { { .result = SCMD_FAILURE_RESULT_ANY, @@ -736,10 +742,10 @@ static void get_sectorsize(struct scsi_cd *cd) }; /* Do the command and wait.. */ - the_result = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer, + err = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer, sizeof(buffer), SR_TIMEOUT, MAX_RETRIES, &exec_args); - if (the_result) { + if (err) { cd->capacity = 0x1fffff; sector_size = 2048; /* A guess, just in case */ } else { @@ -789,10 +795,12 @@ static void get_sectorsize(struct scsi_cd *cd) set_capacity(cd->disk, cd->capacity); } - queue = cd->device->request_queue; - blk_queue_logical_block_size(queue, sector_size); - - return; + lim = queue_limits_start_update(q); + lim.logical_block_size = sector_size; + blk_mq_freeze_queue(q); + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + return err; } static int get_capabilities(struct scsi_cd *cd) From 1652b0bafeaa8281ca9a805d81e13d7647bd2f44 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:08 +0200 Subject: [PATCH 13/26] block: remove unused queue limits API Remove all APIs that are unused now that sd and sr have been converted to the atomic queue limits API. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 190 ----------------------------------------- include/linux/blkdev.h | 24 ------ 2 files changed, 214 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index a49abdb35548..0b038729608f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -293,24 +293,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) } EXPORT_SYMBOL_GPL(queue_limits_set); -/** - * blk_queue_chunk_sectors - set size of the chunk for this queue - * @q: the request queue for the device - * @chunk_sectors: chunk sectors in the usual 512b unit - * - * Description: - * If a driver doesn't want IOs to cross a given chunk size, it can set - * this limit and prevent merging across chunks. Note that the block layer - * must accept a page worth of data at any offset. So if the crossing of - * chunks is a hard limitation in the driver, it must still be prepared - * to split single page bios. - **/ -void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) -{ - q->limits.chunk_sectors = chunk_sectors; -} -EXPORT_SYMBOL(blk_queue_chunk_sectors); - /** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device @@ -352,139 +334,6 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); -/** - * blk_queue_max_zone_append_sectors - set max sectors for a single zone append - * @q: the request queue for the device - * @max_zone_append_sectors: maximum number of sectors to write per command - * - * Sets the maximum number of sectors allowed for zone append commands. If - * Specifying 0 for @max_zone_append_sectors indicates that the queue does - * not natively support zone append operations and that the block layer must - * emulate these operations using regular writes. - **/ -void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors) -{ - unsigned int max_sectors = 0; - - if (WARN_ON(!blk_queue_is_zoned(q))) - return; - - if (max_zone_append_sectors) { - max_sectors = min(q->limits.max_hw_sectors, - max_zone_append_sectors); - max_sectors = min(q->limits.chunk_sectors, max_sectors); - - /* - * Signal eventual driver bugs resulting in the max_zone_append - * sectors limit being 0 due to the chunk_sectors limit (zone - * size) not set or the max_hw_sectors limit not set. - */ - WARN_ON_ONCE(!max_sectors); - } - - q->limits.max_zone_append_sectors = max_sectors; -} -EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); - -/** - * blk_queue_logical_block_size - set logical block size for the queue - * @q: the request queue for the device - * @size: the logical block size, in bytes - * - * Description: - * This should be set to the lowest possible block size that the - * storage device can address. The default of 512 covers most - * hardware. - **/ -void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) -{ - struct queue_limits *limits = &q->limits; - - limits->logical_block_size = size; - - if (limits->discard_granularity < limits->logical_block_size) - limits->discard_granularity = limits->logical_block_size; - - if (limits->physical_block_size < size) - limits->physical_block_size = size; - - if (limits->io_min < limits->physical_block_size) - limits->io_min = limits->physical_block_size; - - limits->max_hw_sectors = - round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); - limits->max_sectors = - round_down(limits->max_sectors, size >> SECTOR_SHIFT); -} -EXPORT_SYMBOL(blk_queue_logical_block_size); - -/** - * blk_queue_physical_block_size - set physical block size for the queue - * @q: the request queue for the device - * @size: the physical block size, in bytes - * - * Description: - * This should be set to the lowest possible sector size that the - * hardware can operate on without reverting to read-modify-write - * operations. - */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) -{ - q->limits.physical_block_size = size; - - if (q->limits.physical_block_size < q->limits.logical_block_size) - q->limits.physical_block_size = q->limits.logical_block_size; - - if (q->limits.discard_granularity < q->limits.physical_block_size) - q->limits.discard_granularity = q->limits.physical_block_size; - - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; -} -EXPORT_SYMBOL(blk_queue_physical_block_size); - -/** - * blk_queue_zone_write_granularity - set zone write granularity for the queue - * @q: the request queue for the zoned device - * @size: the zone write granularity size, in bytes - * - * Description: - * This should be set to the lowest possible size allowing to write in - * sequential zones of a zoned block device. - */ -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size) -{ - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) - return; - - q->limits.zone_write_granularity = size; - - if (q->limits.zone_write_granularity < q->limits.logical_block_size) - q->limits.zone_write_granularity = q->limits.logical_block_size; -} -EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); - -/** - * blk_queue_alignment_offset - set physical block alignment offset - * @q: the request queue for the device - * @offset: alignment offset in bytes - * - * Description: - * Some devices are naturally misaligned to compensate for things like - * the legacy DOS partition table 63-sector offset. Low-level drivers - * should call this function for devices whose first sector is not - * naturally aligned. - */ -void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) -{ - q->limits.alignment_offset = - offset & (q->limits.physical_block_size - 1); - q->limits.misaligned = 0; -} -EXPORT_SYMBOL(blk_queue_alignment_offset); - void disk_update_readahead(struct gendisk *disk) { blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); @@ -514,26 +363,6 @@ void blk_limits_io_min(struct queue_limits *limits, unsigned int min) } EXPORT_SYMBOL(blk_limits_io_min); -/** - * blk_queue_io_min - set minimum request size for the queue - * @q: the request queue for the device - * @min: smallest I/O size in bytes - * - * Description: - * Storage devices may report a granularity or preferred minimum I/O - * size which is the smallest request the device can perform without - * incurring a performance penalty. For disk drives this is often the - * physical block size. For RAID arrays it is often the stripe chunk - * size. A properly aligned multiple of minimum_io_size is the - * preferred request size for workloads where a high number of I/O - * operations is desired. - */ -void blk_queue_io_min(struct request_queue *q, unsigned int min) -{ - blk_limits_io_min(&q->limits, min); -} -EXPORT_SYMBOL(blk_queue_io_min); - /** * blk_limits_io_opt - set optimal request size for a device * @limits: the queue limits @@ -841,25 +670,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); -/** - * disk_set_zoned - inidicate a zoned device - * @disk: gendisk to configure - */ -void disk_set_zoned(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - - WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. - */ - q->limits.zoned = true; - blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); -} -EXPORT_SYMBOL_GPL(disk_set_zoned); - int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24c36929920b..ad995e7a7698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -332,8 +332,6 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk); - #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -638,18 +636,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline void disk_set_max_open_zones(struct gendisk *disk, - unsigned int max_open_zones) -{ - disk->queue->limits.max_open_zones = max_open_zones; -} - -static inline void disk_set_max_active_zones(struct gendisk *disk, - unsigned int max_active_zones) -{ - disk->queue->limits.max_active_zones = max_active_zones; -} - static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; @@ -929,24 +915,14 @@ static inline void queue_limits_cancel_update(struct request_queue *q) /* * Access functions for manipulating queue properties */ -extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); void blk_queue_max_secure_erase_sectors(struct request_queue *q, unsigned int max_sectors); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); -extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); -extern void blk_queue_max_zone_append_sectors(struct request_queue *q, - unsigned int max_zone_append_sectors); -extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); -void blk_queue_zone_write_granularity(struct request_queue *q, - unsigned int size); -extern void blk_queue_alignment_offset(struct request_queue *q, - unsigned int alignment); void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); -extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); From 73e3715ed14844067c5c598e72777641004a7f60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 May 2024 09:48:09 +0200 Subject: [PATCH 14/26] block: add special APIs for run-time disabling of discard and friends A few drivers optimistically try to support discard, write zeroes and secure erase and disable the features from the I/O completion handler if the hardware can't support them. This disable can't be done using the atomic queue limits API because the I/O completion handlers can't take sleeping locks or freeze the queue. Keep the existing clearing of the relevant field to zero, but replace the old blk_queue_max_* APIs with new disable APIs that force the value to 0. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240531074837.1648501-15-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 4 ++-- block/blk-settings.c | 41 ------------------------------------ drivers/block/xen-blkfront.c | 4 ++-- drivers/scsi/sd.c | 4 ++-- include/linux/blkdev.h | 28 ++++++++++++++++++------ 5 files changed, 28 insertions(+), 53 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 093c87879d08..cdcb75a68989 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -451,9 +451,9 @@ static void ubd_end_request(struct io_thread_req *io_req) { if (io_req->error == BLK_STS_NOTSUPP) { if (req_op(io_req->req) == REQ_OP_DISCARD) - blk_queue_max_discard_sectors(io_req->req->q, 0); + blk_queue_disable_discard(io_req->req->q); else if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES) - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); + blk_queue_disable_write_zeroes(io_req->req->q); } blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); diff --git a/block/blk-settings.c b/block/blk-settings.c index 0b038729608f..996f247fc98e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -293,47 +293,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim) } EXPORT_SYMBOL_GPL(queue_limits_set); -/** - * blk_queue_max_discard_sectors - set max sectors for a single discard - * @q: the request queue for the device - * @max_discard_sectors: maximum number of sectors to discard - **/ -void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors) -{ - struct queue_limits *lim = &q->limits; - - lim->max_hw_discard_sectors = max_discard_sectors; - lim->max_discard_sectors = - min(max_discard_sectors, lim->max_user_discard_sectors); -} -EXPORT_SYMBOL(blk_queue_max_discard_sectors); - -/** - * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase - * @q: the request queue for the device - * @max_sectors: maximum number of sectors to secure_erase - **/ -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors) -{ - q->limits.max_secure_erase_sectors = max_sectors; -} -EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); - -/** - * blk_queue_max_write_zeroes_sectors - set max sectors for a single - * write zeroes - * @q: the request queue for the device - * @max_write_zeroes_sectors: maximum number of sectors to write per command - **/ -void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_zeroes_sectors) -{ - q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; -} -EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); - void disk_update_readahead(struct gendisk *disk) { blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index fd7c0ff2139c..9b4ec3e4908c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1605,8 +1605,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) blkif_req(req)->error = BLK_STS_NOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; - blk_queue_max_discard_sectors(rq, 0); - blk_queue_max_secure_erase_sectors(rq, 0); + blk_queue_disable_discard(rq); + blk_queue_disable_secure_erase(rq); } break; case BLKIF_OP_FLUSH_DISKCACHE: diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 049071b56819..d957e29b17a9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -837,7 +837,7 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, static void sd_disable_discard(struct scsi_disk *sdkp) { sdkp->provisioning_mode = SD_LBP_DISABLE; - blk_queue_max_discard_sectors(sdkp->disk->queue, 0); + blk_queue_disable_discard(sdkp->disk->queue); } static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, @@ -1019,7 +1019,7 @@ static void sd_disable_write_same(struct scsi_disk *sdkp) { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; - blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0); + blk_queue_disable_write_zeroes(sdkp->disk->queue); } static void sd_config_write_same(struct scsi_disk *sdkp, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ad995e7a7698..ac8e0cb2353a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -912,15 +912,31 @@ static inline void queue_limits_cancel_update(struct request_queue *q) mutex_unlock(&q->limits_lock); } +/* + * These helpers are for drivers that have sloppy feature negotiation and might + * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O + * completion handler when the device returned an indicator that the respective + * feature is not actually supported. They are racy and the driver needs to + * cope with that. Try to avoid this scheme if you can. + */ +static inline void blk_queue_disable_discard(struct request_queue *q) +{ + q->limits.max_discard_sectors = 0; +} + +static inline void blk_queue_disable_secure_erase(struct request_queue *q) +{ + q->limits.max_secure_erase_sectors = 0; +} + +static inline void blk_queue_disable_write_zeroes(struct request_queue *q) +{ + q->limits.max_write_zeroes_sectors = 0; +} + /* * Access functions for manipulating queue properties */ -void blk_queue_max_secure_erase_sectors(struct request_queue *q, - unsigned int max_sectors); -extern void blk_queue_max_discard_sectors(struct request_queue *q, - unsigned int max_discard_sectors); -extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, - unsigned int max_write_same_sectors); void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); From 899ee2c3829c5ac14bfc7d3c4a5846c0b709b78f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:11 +0200 Subject: [PATCH 15/26] block: initialize integrity buffer to zero before writing it to media Metadata added by bio_integrity_prep is using plain kmalloc, which leads to random kernel memory being written media. For PI metadata this is limited to the app tag that isn't used by kernel generated metadata, but for non-PI metadata the entire buffer leaks kernel memory. Fix this by adding the __GFP_ZERO flag to allocations for writes. Fixes: 7ba1ba12eeef ("block: Block layer data integrity support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240613084839.1044015-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2e3e8e04961e..af7f71d16114 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -432,6 +432,7 @@ bool bio_integrity_prep(struct bio *bio) unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; + gfp_t gfp = GFP_NOIO; if (!bi) return true; @@ -454,11 +455,19 @@ bool bio_integrity_prep(struct bio *bio) if (!bi->profile->generate_fn || !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk. For PI this only affects the app tag, but + * for non-integrity metadata it affects the entire metadata + * buffer. + */ + gfp |= __GFP_ZERO; } /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, GFP_NOIO); + buf = kmalloc(len, gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; From d11854ed05635e4a73fa61a988ffdd0978c9e202 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:12 +0200 Subject: [PATCH 16/26] md/raid0: don't free conf on raid0_run failure The core md code calls the ->free method which already frees conf. Fixes: 0c031fd37f69 ("md: Move alloc/free acct bioset in to personality") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240613084839.1044015-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/raid0.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c5d4aeb68404..81c01347cd24 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -365,18 +365,13 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static void free_conf(struct mddev *mddev, struct r0conf *conf) -{ - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); -} - static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - free_conf(mddev, conf); + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); } static int raid0_set_limits(struct mddev *mddev) @@ -415,7 +410,7 @@ static int raid0_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid0_set_limits(mddev); if (ret) - goto out_free_conf; + return ret; } /* calculate array device size */ @@ -427,13 +422,7 @@ static int raid0_run(struct mddev *mddev) dump_zones(mddev); - ret = md_integrity_register(mddev); - if (ret) - goto out_free_conf; - return 0; -out_free_conf: - free_conf(mddev, conf); - return ret; + return md_integrity_register(mddev); } /* From 799af947ed132956d6de6d77a5bc053817ccb06b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:13 +0200 Subject: [PATCH 17/26] md/raid1: don't free conf on raid0_run failure The core md code calls the ->free method which already frees conf. Fixes: 07f1a6850c5d ("md/raid1: fail run raid1 array when active disk less than one") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7b8a71ca66dd..1f321826ef02 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3204,7 +3204,6 @@ static int raid1_set_limits(struct mddev *mddev) return queue_limits_set(mddev->gendisk->queue, &lim); } -static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; @@ -3238,7 +3237,7 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); if (ret) - goto abort; + return ret; } mddev->degraded = 0; @@ -3252,8 +3251,7 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); - ret = -EINVAL; - goto abort; + return -EINVAL; } if (conf->raid_disks - mddev->degraded == 1) @@ -3277,14 +3275,8 @@ static int raid1_run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); ret = md_integrity_register(mddev); - if (ret) { + if (ret) md_unregister_thread(mddev, &mddev->thread); - goto abort; - } - return 0; - -abort: - raid1_free(mddev, conf); return ret; } From 63e649594ab19cc3122a2d0fc2c94b19932f0b19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:14 +0200 Subject: [PATCH 18/26] dm-integrity: use the nop integrity profile Use the block layer built-in nop profile instead of duplicating it. Tested by: $ dd if=/dev/urandom of=key.bin bs=512 count=1 $ cryptsetup luksFormat -q --type luks2 --integrity hmac-sha256 \ --integrity-no-wipe /dev/nvme0n1 key.bin $ cryptsetup luksOpen /dev/nvme0n1 luks-integrity --key-file key.bin and then doing mkfs.xfs and simple I/O on the mount file system. Signed-off-by: Christoph Hellwig Reviewed-by: Milan Broz Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 4 ++-- drivers/md/dm-integrity.c | 20 -------------------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1b7a97cc3779..1dfc462f29cd 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1176,8 +1176,8 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); struct mapped_device *md = dm_table_get_md(ti->table); - /* From now we require underlying device with our integrity profile */ - if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { + /* We require an underlying device with non-PI metadata */ + if (!bi || strcmp(bi->profile->name, "nop")) { ti->error = "Integrity profile not supported."; return -EINVAL; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 417fddebe367..c1cc27541673 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -350,25 +350,6 @@ static struct kmem_cache *journal_io_cache; #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif -static void dm_integrity_prepare(struct request *rq) -{ -} - -static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -/* - * DM Integrity profile, protection is performed layer above (dm-crypt) - */ -static const struct blk_integrity_profile dm_integrity_profile = { - .name = "DM-DIF-EXT-TAG", - .generate_fn = NULL, - .verify_fn = NULL, - .prepare_fn = dm_integrity_prepare, - .complete_fn = dm_integrity_complete, -}; - static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); static void integrity_bio_wait(struct work_struct *w); static void dm_integrity_dtr(struct dm_target *ti); @@ -3656,7 +3637,6 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) struct blk_integrity bi; memset(&bi, 0, sizeof(bi)); - bi.profile = &dm_integrity_profile; bi.tuple_size = ic->tag_size; bi.tag_size = bi.tuple_size; bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; From e9f5f44ad3725335d9c559c3c22cd3726152a7b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:15 +0200 Subject: [PATCH 19/26] block: remove the blk_integrity_profile structure Block layer integrity configuration is a bit complex right now, as it indirects through operation vectors for a simple two-dimensional configuration: a) the checksum type of none, ip checksum, crc, crc64 b) the presence or absence of a reference tag Remove the integrity profile, and instead add a separate csum_type flag which replaces the existing ip-checksum field and a new flag that indicates the presence of the reference tag. This removes up to two layers of indirect calls, remove the need to offload the no-op verification of non-PI metadata to a workqueue and generally simplifies the code. The downside is that block/t10-pi.c now has to be built into the kernel when CONFIG_BLK_DEV_INTEGRITY is supported. Given that both nvme and SCSI require t10-pi.ko, it is loaded for all usual configurations that enabled CONFIG_BLK_DEV_INTEGRITY already, though. Signed-off-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-6-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 8 +- block/Makefile | 3 +- block/bio-integrity.c | 32 ++-- block/blk-integrity.c | 66 ++++---- block/blk-mq.c | 13 +- block/blk.h | 8 + block/t10-pi.c | 239 ++++++++++------------------ drivers/md/dm-crypt.c | 2 +- drivers/nvme/host/Kconfig | 1 - drivers/nvme/host/core.c | 17 +- drivers/nvme/target/Kconfig | 1 - drivers/nvme/target/io-cmd-bdev.c | 16 +- drivers/scsi/Kconfig | 1 - drivers/scsi/sd_dif.c | 19 +-- drivers/target/target_core_iblock.c | 53 +++--- include/linux/blk-integrity.h | 35 ++-- include/linux/blkdev.h | 9 +- include/linux/t10-pi.h | 8 - 18 files changed, 216 insertions(+), 315 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index dc12af58dbae..5b623b876d3b 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -62,6 +62,8 @@ config BLK_DEV_BSGLIB config BLK_DEV_INTEGRITY bool "Block layer data integrity support" + select CRC_T10DIF + select CRC64_ROCKSOFT help Some storage devices allow extra information to be stored/retrieved to help protect the data. The block layer @@ -72,12 +74,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_DEV_INTEGRITY_T10 - tristate - depends on BLK_DEV_INTEGRITY - select CRC_T10DIF - select CRC64_ROCKSOFT - config BLK_DEV_WRITE_MOUNTED bool "Allow writing to mounted block devices" default y diff --git a/block/Makefile b/block/Makefile index 168150b9c510..ddfd21c1a9ff 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,8 +26,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o -obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o diff --git a/block/bio-integrity.c b/block/bio-integrity.c index af7f71d16114..31dbc2853f92 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -378,10 +378,9 @@ EXPORT_SYMBOL_GPL(bio_integrity_map_user); * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for * @proc_iter: iterator to process - * @proc_fn: Pointer to the relevant processing function */ static blk_status_t bio_integrity_process(struct bio *bio, - struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) + struct bvec_iter *proc_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; @@ -392,17 +391,18 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; - iter.tuple_size = bi->tuple_size; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); - iter.pi_offset = bi->pi_offset; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); + if (bio_data_dir(bio) == WRITE) + blk_integrity_generate(&iter, bi); + else + ret = blk_integrity_verify(&iter, bi); kunmap_local(kaddr); if (ret) @@ -448,12 +448,10 @@ bool bio_integrity_prep(struct bio *bio) return true; if (bio_data_dir(bio) == READ) { - if (!bi->profile->verify_fn || - !(bi->flags & BLK_INTEGRITY_VERIFY)) + if (!(bi->flags & BLK_INTEGRITY_VERIFY)) return true; } else { - if (!bi->profile->generate_fn || - !(bi->flags & BLK_INTEGRITY_GENERATE)) + if (!(bi->flags & BLK_INTEGRITY_GENERATE)) return true; /* @@ -488,7 +486,7 @@ bool bio_integrity_prep(struct bio *bio) bip->bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(bip, bio->bi_iter.bi_sector); - if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; /* Map it */ @@ -511,12 +509,10 @@ bool bio_integrity_prep(struct bio *bio) } /* Auto-generate integrity metadata if this is a write */ - if (bio_data_dir(bio) == WRITE) { - bio_integrity_process(bio, &bio->bi_iter, - bi->profile->generate_fn); - } else { + if (bio_data_dir(bio) == WRITE) + bio_integrity_process(bio, &bio->bi_iter); + else bip->bio_iter = bio->bi_iter; - } return true; err_end_io: @@ -539,15 +535,13 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced * during split and completion, we need to rewind iterator to * it's original position. */ - bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, - bi->profile->verify_fn); + bio->bi_status = bio_integrity_process(bio, &bip->bio_iter); bio_integrity_free(bio); bio_endio(bio); } @@ -569,7 +563,7 @@ bool __bio_integrity_endio(struct bio *bio) struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && - (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) { + (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->csum_type) { INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); return false; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ccbeb6dfa87a..17d37badfbb8 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -123,10 +123,10 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) struct blk_integrity *b1 = &gd1->queue->integrity; struct blk_integrity *b2 = &gd2->queue->integrity; - if (!b1->profile && !b2->profile) + if (!b1->tuple_size && !b2->tuple_size) return 0; - if (!b1->profile || !b2->profile) + if (!b1->tuple_size || !b2->tuple_size) return -1; if (b1->interval_exp != b2->interval_exp) { @@ -150,10 +150,13 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) return -1; } - if (b1->profile != b2->profile) { + if (b1->csum_type != b2->csum_type || + (b1->flags & BLK_INTEGRITY_REF_TAG) != + (b2->flags & BLK_INTEGRITY_REF_TAG)) { pr_err("%s: %s/%s type %s != %s\n", __func__, gd1->disk_name, gd2->disk_name, - b1->profile->name, b2->profile->name); + blk_integrity_profile_name(b1), + blk_integrity_profile_name(b2)); return -1; } @@ -217,14 +220,37 @@ static inline struct blk_integrity *dev_to_bi(struct device *dev) return &dev_to_disk(dev)->queue->integrity; } +const char *blk_integrity_profile_name(struct blk_integrity *bi) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-IP"; + return "T10-DIF-TYPE3-IP"; + case BLK_INTEGRITY_CSUM_CRC: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "T10-DIF-TYPE1-CRC"; + return "T10-DIF-TYPE3-CRC"; + case BLK_INTEGRITY_CSUM_CRC64: + if (bi->flags & BLK_INTEGRITY_REF_TAG) + return "EXT-DIF-TYPE1-CRC64"; + return "EXT-DIF-TYPE3-CRC64"; + case BLK_INTEGRITY_CSUM_NONE: + break; + } + + return "nop"; +} +EXPORT_SYMBOL_GPL(blk_integrity_profile_name); + static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *page) { struct blk_integrity *bi = dev_to_bi(dev); - if (bi->profile && bi->profile->name) - return sysfs_emit(page, "%s\n", bi->profile->name); - return sysfs_emit(page, "none\n"); + if (!bi->tuple_size) + return sysfs_emit(page, "none\n"); + return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi)); } static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, @@ -326,28 +352,6 @@ const struct attribute_group blk_integrity_attr_group = { .attrs = integrity_attrs, }; -static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) -{ - return BLK_STS_OK; -} - -static void blk_integrity_nop_prepare(struct request *rq) -{ -} - -static void blk_integrity_nop_complete(struct request *rq, - unsigned int nr_bytes) -{ -} - -static const struct blk_integrity_profile nop_profile = { - .name = "nop", - .generate_fn = blk_integrity_nop_fn, - .verify_fn = blk_integrity_nop_fn, - .prepare_fn = blk_integrity_nop_prepare, - .complete_fn = blk_integrity_nop_complete, -}; - /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware @@ -363,11 +367,11 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template { struct blk_integrity *bi = &disk->queue->integrity; + bi->csum_type = template->csum_type; bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | template->flags; bi->interval_exp = template->interval_exp ? : ilog2(queue_logical_block_size(disk->queue)); - bi->profile = template->profile ? template->profile : &nop_profile; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; bi->pi_offset = template->pi_offset; @@ -394,7 +398,7 @@ void blk_integrity_unregister(struct gendisk *disk) { struct blk_integrity *bi = &disk->queue->integrity; - if (!bi->profile) + if (!bi->tuple_size) return; /* ensure all bios are off the integrity workqueue */ diff --git a/block/blk-mq.c b/block/blk-mq.c index 3b4df8e5ac9e..0d4cd39c3d25 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -804,10 +804,8 @@ static void blk_complete_request(struct request *req) if (!bio) return; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) - req->q->integrity.profile->complete_fn(req, total_bytes); -#endif + blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -875,11 +873,9 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif + blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last @@ -1264,10 +1260,9 @@ void blk_mq_start_request(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; -#ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) - q->integrity.profile->prepare_fn(rq); -#endif + blk_integrity_prepare(rq); + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } diff --git a/block/blk.h b/block/blk.h index 189bc25beb50..79e8d5d4fe0c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -9,6 +9,7 @@ #include #include "blk-crypto-internal.h" +struct blk_integrity_iter; struct elevator_type; /* Max future timer expiry for timeouts */ @@ -673,4 +674,11 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); +void blk_integrity_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi); +blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + struct blk_integrity *bi); +void blk_integrity_prepare(struct request *rq); +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); + #endif /* BLK_INTERNAL_H */ diff --git a/block/t10-pi.c b/block/t10-pi.c index f4cc91156da1..dadecf621497 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -11,17 +11,14 @@ #include #include #include +#include "blk.h" -typedef __be16 (csum_fn) (__be16, void *, unsigned int); - -static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len) +static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, + unsigned char csum_type) { - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len)); -} - -static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) -{ - return (__force __be16)ip_compute_csum(data, len); + if (csum_type == BLK_INTEGRITY_CSUM_IP) + return (__force __be16)ip_compute_csum(data, len); + return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); } /* @@ -29,48 +26,44 @@ static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref * tag. */ -static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) +static void t10_pi_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; - pi->guard_tag = fn(0, iter->data_buf, iter->interval); + pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, - offset); + pi->guard_tag = t10_pi_csum(pi->guard_tag, + iter->prot_buf, offset, bi->csum_type); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; - BUG_ON(type == T10_PI_TYPE0_PROTECTION); - for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf + offset; __be16 csum; - if (type == T10_PI_TYPE1_PROTECTION || - type == T10_PI_TYPE2_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -82,15 +75,17 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; } - csum = fn(0, iter->data_buf, iter->interval); + csum = t10_pi_csum(0, iter->data_buf, iter->interval, + bi->csum_type); if (offset) - csum = fn(csum, iter->prot_buf, offset); + csum = t10_pi_csum(csum, iter->prot_buf, offset, + bi->csum_type); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -102,33 +97,13 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); -} - /** * t10_pi_type1_prepare - prepare PI prior submitting request to device * @rq: request with PI that should be prepared @@ -225,81 +200,15 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); -} - -/* Type 3 does not have a reference tag so no remapping is required. */ -static void t10_pi_type3_prepare(struct request *rq) -{ -} - -/* Type 3 does not have a reference tag so no remapping is required. */ -static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) -{ -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, - .prepare_fn = t10_pi_type1_prepare, - .complete_fn = t10_pi_type1_complete, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) { return cpu_to_be64(crc64_rocksoft_update(crc, data, len)); } -static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, - enum t10_dif_type type) +static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { @@ -311,17 +220,15 @@ static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, iter->prot_buf, offset); pi->app_tag = 0; - if (type == T10_PI_TYPE1_PROTECTION) + if (bi->flags & BLK_INTEGRITY_REF_TAG) put_unaligned_be48(iter->seed, pi->ref_tag); else put_unaligned_be48(0ULL, pi->ref_tag); iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } - - return BLK_STS_OK; } static bool ext_pi_ref_escape(u8 *ref_tag) @@ -332,9 +239,9 @@ static bool ext_pi_ref_escape(u8 *ref_tag) } static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, - enum t10_dif_type type) + struct blk_integrity *bi) { - u8 offset = iter->pi_offset; + u8 offset = bi->pi_offset; unsigned int i; for (i = 0; i < iter->data_size; i += iter->interval) { @@ -342,7 +249,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, u64 ref, seed; __be64 csum; - if (type == T10_PI_TYPE1_PROTECTION) { + if (bi->flags & BLK_INTEGRITY_REF_TAG) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -353,7 +260,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, iter->disk_name, seed, ref); return BLK_STS_PROTECTION; } - } else if (type == T10_PI_TYPE3_PROTECTION) { + } else { if (pi->app_tag == T10_PI_APP_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) goto next; @@ -374,23 +281,13 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += iter->tuple_size; + iter->prot_buf += bi->tuple_size; iter->seed++; } return BLK_STS_OK; } -static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter) -{ - return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION); -} - -static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) -{ - return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION); -} - static void ext_pi_type1_prepare(struct request *rq) { struct blk_integrity *bi = &rq->q->integrity; @@ -467,33 +364,61 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) } } -static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter) +void blk_integrity_generate(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + ext_pi_crc64_generate(iter, bi); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + t10_pi_generate(iter, bi); + break; + default: + break; + } } -static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter) +blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + struct blk_integrity *bi) { - return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return ext_pi_crc64_verify(iter, bi); + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return t10_pi_verify(iter, bi); + default: + return BLK_STS_OK; + } } -const struct blk_integrity_profile ext_pi_type1_crc64 = { - .name = "EXT-DIF-TYPE1-CRC64", - .generate_fn = ext_pi_type1_generate_crc64, - .verify_fn = ext_pi_type1_verify_crc64, - .prepare_fn = ext_pi_type1_prepare, - .complete_fn = ext_pi_type1_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type1_crc64); +void blk_integrity_prepare(struct request *rq) +{ + struct blk_integrity *bi = &rq->q->integrity; -const struct blk_integrity_profile ext_pi_type3_crc64 = { - .name = "EXT-DIF-TYPE3-CRC64", - .generate_fn = ext_pi_type3_generate_crc64, - .verify_fn = ext_pi_type3_verify_crc64, - .prepare_fn = t10_pi_type3_prepare, - .complete_fn = t10_pi_type3_complete, -}; -EXPORT_SYMBOL_GPL(ext_pi_type3_crc64); + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_prepare(rq); + else + t10_pi_type1_prepare(rq); +} + +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ + struct blk_integrity *bi = &rq->q->integrity; + + if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) + ext_pi_type1_complete(rq, nr_bytes); + else + t10_pi_type1_complete(rq, nr_bytes); +} MODULE_DESCRIPTION("T10 Protection Information module"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1dfc462f29cd..6c013ceb0e5f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1177,7 +1177,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) struct mapped_device *md = dm_table_get_md(ti->table); /* We require an underlying device with non-PI metadata */ - if (!bi || strcmp(bi->profile->name, "nop")) { + if (!bi || bi->csum_type != BLK_INTEGRITY_CSUM_NONE) { ti->error = "Integrity profile not supported."; return -EINVAL; } diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index b309c8be720f..a3caef75aa0a 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config NVME_CORE tristate - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY config BLK_DEV_NVME tristate "NVM Express block device" diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f5d150c62955..14bac248cde4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1744,17 +1744,16 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE3: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.profile = &t10_pi_type3_crc; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; integrity.tag_size = sizeof(u16) + sizeof(u32); integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; case NVME_NVM_NS_64B_GUARD: - integrity.profile = &ext_pi_type3_crc64; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; integrity.tag_size = sizeof(u16) + 6; integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; default: - integrity.profile = NULL; break; } break; @@ -1762,22 +1761,22 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE2: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.profile = &t10_pi_type1_crc; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; case NVME_NVM_NS_64B_GUARD: - integrity.profile = &ext_pi_type1_crc64; + integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; default: - integrity.profile = NULL; break; } break; default: - integrity.profile = NULL; break; } diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 872dd1a0acd8..c42aec41cc7b 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -6,7 +6,6 @@ config NVME_TARGET depends on CONFIGFS_FS select NVME_KEYRING if NVME_TARGET_TCP_TLS select KEYS if NVME_TARGET_TCP_TLS - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY select SGL_ALLOC help This enabled target side support for the NVMe protocol, that is diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 6426aac2634a..b628bc5ee998 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -61,15 +61,17 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) { struct blk_integrity *bi = bdev_get_integrity(ns->bdev); - if (bi) { + if (!bi) + return; + + if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) { ns->metadata_size = bi->tuple_size; - if (bi->profile == &t10_pi_type1_crc) + if (bi->flags & BLK_INTEGRITY_REF_TAG) ns->pi_type = NVME_NS_DPS_PI_TYPE1; - else if (bi->profile == &t10_pi_type3_crc) - ns->pi_type = NVME_NS_DPS_PI_TYPE3; else - /* Unsupported metadata type */ - ns->metadata_size = 0; + ns->pi_type = NVME_NS_DPS_PI_TYPE3; + } else { + ns->metadata_size = 0; } } @@ -102,7 +104,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) ns->pi_type = 0; ns->metadata_size = 0; - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10)) + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) nvmet_bdev_ns_enable_integrity(ns); if (bdev_is_zoned(ns->bdev)) { diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 065db86d6021..37c24ffea65c 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -82,7 +82,6 @@ comment "SCSI support type (disk, tape, CD-ROM)" config BLK_DEV_SD tristate "SCSI disk support" depends on SCSI - select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY help If you want to use SCSI hard disks, Fibre Channel disks, Serial ATA (SATA) or Parallel ATA (PATA) hard disks, diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 1df847b5f747..6f0921c7db78 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -47,18 +47,13 @@ void sd_dif_config_host(struct scsi_disk *sdkp) memset(&bi, 0, sizeof(bi)); /* Enable DMA of protection information */ - if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) { - if (type == T10_PI_TYPE3_PROTECTION) - bi.profile = &t10_pi_type3_ip; - else - bi.profile = &t10_pi_type1_ip; + if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) + bi.csum_type = BLK_INTEGRITY_CSUM_IP; + else + bi.csum_type = BLK_INTEGRITY_CSUM_CRC; - bi.flags |= BLK_INTEGRITY_IP_CHECKSUM; - } else - if (type == T10_PI_TYPE3_PROTECTION) - bi.profile = &t10_pi_type3_crc; - else - bi.profile = &t10_pi_type1_crc; + if (type != T10_PI_TYPE3_PROTECTION) + bi.flags |= BLK_INTEGRITY_REF_TAG; bi.tuple_size = sizeof(struct t10_pi_tuple); @@ -76,7 +71,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp) sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIX %s, application tag size %u bytes\n", - bi.profile->name, bi.tag_size); + blk_integrity_profile_name(&bi), bi.tag_size); out: blk_integrity_register(disk, &bi); } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 7f6ca8177845..a3e09adc4e76 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -148,35 +148,38 @@ static int iblock_configure_device(struct se_device *dev) dev->dev_attrib.is_nonrot = 1; bi = bdev_get_integrity(bd); - if (bi) { - struct bio_set *bs = &ib_dev->ibd_bio_set; + if (!bi) + return 0; - if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-IP") || - !strcmp(bi->profile->name, "T10-DIF-TYPE1-IP")) { - pr_err("IBLOCK export of blk_integrity: %s not" - " supported\n", bi->profile->name); - ret = -ENOSYS; - goto out_blkdev_put; - } - - if (!strcmp(bi->profile->name, "T10-DIF-TYPE3-CRC")) { - dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE3_PROT; - } else if (!strcmp(bi->profile->name, "T10-DIF-TYPE1-CRC")) { + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + pr_err("IBLOCK export of blk_integrity: %s not supported\n", + blk_integrity_profile_name(bi)); + ret = -ENOSYS; + goto out_blkdev_put; + case BLK_INTEGRITY_CSUM_CRC: + if (bi->flags & BLK_INTEGRITY_REF_TAG) dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE1_PROT; - } - - if (dev->dev_attrib.pi_prot_type) { - if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { - pr_err("Unable to allocate bioset for PI\n"); - ret = -ENOMEM; - goto out_blkdev_put; - } - pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", - &bs->bio_integrity_pool); - } - dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; + else + dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE3_PROT; + break; + default: + break; } + if (dev->dev_attrib.pi_prot_type) { + struct bio_set *bs = &ib_dev->ibd_bio_set; + + if (bioset_integrity_create(bs, IBLOCK_BIO_POOL_SIZE) < 0) { + pr_err("Unable to allocate bioset for PI\n"); + ret = -ENOMEM; + goto out_blkdev_put; + } + pr_debug("IBLOCK setup BIP bs->bio_integrity_pool: %p\n", + &bs->bio_integrity_pool); + } + + dev->dev_attrib.hw_pi_prot_type = dev->dev_attrib.pi_prot_type; return 0; out_blkdev_put: diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 7428cb43952d..56ce1ae35580 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -10,7 +10,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_VERIFY = 1 << 0, BLK_INTEGRITY_GENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, - BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, + BLK_INTEGRITY_REF_TAG = 1 << 3, }; struct blk_integrity_iter { @@ -19,22 +19,10 @@ struct blk_integrity_iter { sector_t seed; unsigned int data_size; unsigned short interval; - unsigned char tuple_size; - unsigned char pi_offset; const char *disk_name; }; -typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); -typedef void (integrity_prepare_fn) (struct request *); -typedef void (integrity_complete_fn) (struct request *, unsigned int); - -struct blk_integrity_profile { - integrity_processing_fn *generate_fn; - integrity_processing_fn *verify_fn; - integrity_prepare_fn *prepare_fn; - integrity_complete_fn *complete_fn; - const char *name; -}; +const char *blk_integrity_profile_name(struct blk_integrity *bi); #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_integrity_register(struct gendisk *, struct blk_integrity *); @@ -44,14 +32,17 @@ int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return q->integrity.tuple_size; +} + static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->profile) + if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; - - return bi; + return &disk->queue->integrity; } static inline struct blk_integrity * @@ -60,12 +51,6 @@ bdev_get_integrity(struct block_device *bdev) return blk_get_integrity(bdev->bd_disk); } -static inline bool -blk_integrity_queue_supports_integrity(struct request_queue *q) -{ - return q->integrity.profile; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ac8e0cb2353a..bdd33388e1ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -105,9 +105,16 @@ enum { struct disk_events; struct badblocks; +enum blk_integrity_checksum { + BLK_INTEGRITY_CSUM_NONE = 0, + BLK_INTEGRITY_CSUM_IP = 1, + BLK_INTEGRITY_CSUM_CRC = 2, + BLK_INTEGRITY_CSUM_CRC64 = 3, +} __packed ; + struct blk_integrity { - const struct blk_integrity_profile *profile; unsigned char flags; + enum blk_integrity_checksum csum_type; unsigned char tuple_size; unsigned char pi_offset; unsigned char interval_exp; diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 248f4ac95642..d2bafb76badf 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -48,11 +48,6 @@ static inline u32 t10_pi_ref_tag(struct request *rq) return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } -extern const struct blk_integrity_profile t10_pi_type1_crc; -extern const struct blk_integrity_profile t10_pi_type1_ip; -extern const struct blk_integrity_profile t10_pi_type3_crc; -extern const struct blk_integrity_profile t10_pi_type3_ip; - struct crc64_pi_tuple { __be64 guard_tag; __be16 app_tag; @@ -79,7 +74,4 @@ static inline u64 ext_pi_ref_tag(struct request *rq) return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); } -extern const struct blk_integrity_profile ext_pi_type1_crc64; -extern const struct blk_integrity_profile ext_pi_type3_crc64; - #endif From e8bc14d116aeac8f0f133ec8d249acf4e0658da7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:16 +0200 Subject: [PATCH 20/26] block: remove the blk_flush_integrity call in blk_integrity_unregister Now that there are no indirect calls for PI processing there is no way to dereference a NULL pointer here. Additionally drivers now always freeze the queue (or in case of stacking drivers use their internal equivalent) around changing the integrity profile. This is effectively a revert of commit 3df49967f6f1 ("block: flush the integrity workqueue in blk_integrity_unregister"). Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 17d37badfbb8..24f04575096d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -401,8 +401,6 @@ void blk_integrity_unregister(struct gendisk *disk) if (!bi->tuple_size) return; - /* ensure all bios are off the integrity workqueue */ - blk_flush_integrity(); blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(bi, 0, sizeof(*bi)); } From 1366251a794b149a132ef8423c8946b6e565a923 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:17 +0200 Subject: [PATCH 21/26] block: factor out flag_{store,show} helper for integrity Factor the duplicate code for the generate and verify attributes into common helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 53 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 24f04575096d..24671d9f90a1 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -243,6 +243,28 @@ const char *blk_integrity_profile_name(struct blk_integrity *bi) } EXPORT_SYMBOL_GPL(blk_integrity_profile_name); +static ssize_t flag_store(struct device *dev, struct device_attribute *attr, + const char *page, size_t count, unsigned char flag) +{ + struct blk_integrity *bi = dev_to_bi(dev); + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + bi->flags |= flag; + else + bi->flags &= ~flag; + return count; +} + +static ssize_t flag_show(struct device *dev, struct device_attribute *attr, + char *page, unsigned char flag) +{ + struct blk_integrity *bi = dev_to_bi(dev); + + return sysfs_emit(page, "%d\n", !!(bi->flags & flag)); +} + static ssize_t format_show(struct device *dev, struct device_attribute *attr, char *page) { @@ -275,49 +297,26 @@ static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_VERIFY; - else - bi->flags &= ~BLK_INTEGRITY_VERIFY; - - return count; + return flag_store(dev, attr, page, count, BLK_INTEGRITY_VERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY)); + return flag_show(dev, attr, page, BLK_INTEGRITY_VERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = dev_to_bi(dev); - - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); - - if (val) - bi->flags |= BLK_INTEGRITY_GENERATE; - else - bi->flags &= ~BLK_INTEGRITY_GENERATE; - - return count; + return flag_store(dev, attr, page, count, BLK_INTEGRITY_GENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { - struct blk_integrity *bi = dev_to_bi(dev); - - return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE)); + return flag_show(dev, attr, page, BLK_INTEGRITY_GENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, From 1d59857ed2ec4d506e346859713c4325b5053da3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:18 +0200 Subject: [PATCH 22/26] block: use kstrtoul in flag_store Use the text to integer helper that has error handling and doesn't modify the input pointer. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240613084839.1044015-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 24671d9f90a1..58760a6d6b22 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -247,8 +247,12 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count, unsigned char flag) { struct blk_integrity *bi = dev_to_bi(dev); - char *p = (char *) page; - unsigned long val = simple_strtoul(p, &p, 10); + unsigned long val; + int err; + + err = kstrtoul(page, 10, &val); + if (err) + return err; if (val) bi->flags |= flag; From 43c5dbe98a3953e07f4fbf89aa137b9207d52378 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:19 +0200 Subject: [PATCH 23/26] block: don't require stable pages for non-PI metadata Non-PI metadata doesn't contain checksums and thus doesn't require stable pages. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 58760a6d6b22..1d2d371cd632 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -379,7 +379,8 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tag_size = template->tag_size; bi->pi_offset = template->pi_offset; - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->crypto_profile) { @@ -404,7 +405,8 @@ void blk_integrity_unregister(struct gendisk *disk) if (!bi->tuple_size) return; - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(bi, 0, sizeof(*bi)); } EXPORT_SYMBOL(blk_integrity_unregister); From 3c3e85ddffae93eba1a257eb6939bf5dc1e93b9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:20 +0200 Subject: [PATCH 24/26] block: bypass the STABLE_WRITES flag for protection information Currently registering a checksum-enabled (aka PI) integrity profile sets the QUEUE_FLAG_STABLE_WRITE flag, and unregistering it clears the flag. This can incorrectly clear the flag when the driver requires stable writes even without PI, e.g. in case of iSCSI or NVMe/TCP with data digest enabled. Fix this by looking at the csum_type directly in bdev_stable_writes and not setting the queue flag. Also remove the blk_queue_stable_writes helper as the only user in nvme wants to only look at the actual QUEUE_FLAG_STABLE_WRITE flag as it inherits the integrity configuration by other means. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240613084839.1044015-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 6 ------ drivers/nvme/host/multipath.c | 3 ++- include/linux/blkdev.h | 12 ++++++++---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 1d2d371cd632..bec0d1df387c 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -379,9 +379,6 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tag_size = template->tag_size; bi->pi_offset = template->pi_offset; - if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); - #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); @@ -404,9 +401,6 @@ void blk_integrity_unregister(struct gendisk *disk) if (!bi->tuple_size) return; - - if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) - blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); memset(bi, 0, sizeof(*bi)); } EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d8b6b4648eaf..12c59db02539 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -875,7 +875,8 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) nvme_mpath_set_live(ns); } - if (blk_queue_stable_writes(ns->queue) && ns->head->disk) + if (test_bit(QUEUE_FLAG_STABLE_WRITES, &ns->queue->queue_flags) && + ns->head->disk) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->head->disk->queue); #ifdef CONFIG_BLK_DEV_ZONED diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdd33388e1ce..f9089750919c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,8 +571,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) -#define blk_queue_stable_writes(q) \ - test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ @@ -1300,8 +1298,14 @@ static inline bool bdev_synchronous(struct block_device *bdev) static inline bool bdev_stable_writes(struct block_device *bdev) { - return test_bit(QUEUE_FLAG_STABLE_WRITES, - &bdev_get_queue(bdev)->queue_flags); + struct request_queue *q = bdev_get_queue(bdev); + +#ifdef CONFIG_BLK_DEV_INTEGRITY + /* BLK_INTEGRITY_CSUM_NONE is not available in blkdev.h */ + if (q->integrity.csum_type != 0) + return true; +#endif + return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } static inline bool bdev_write_cache(struct block_device *bdev) From 9f4aa46f2a7401025d8561495cf8740f773310fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:21 +0200 Subject: [PATCH 25/26] block: invert the BLK_INTEGRITY_{GENERATE,VERIFY} flags Invert the flags so that user set values will be able to persist revalidating the integrity information once we switch the integrity information to queue_limits. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-12-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- block/blk-integrity.c | 18 +++++++++--------- include/linux/blk-integrity.h | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 31dbc2853f92..173ffd4d6237 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -448,10 +448,10 @@ bool bio_integrity_prep(struct bio *bio) return true; if (bio_data_dir(bio) == READ) { - if (!(bi->flags & BLK_INTEGRITY_VERIFY)) + if (bi->flags & BLK_INTEGRITY_NOVERIFY) return true; } else { - if (!(bi->flags & BLK_INTEGRITY_GENERATE)) + if (bi->flags & BLK_INTEGRITY_NOGENERATE) return true; /* diff --git a/block/blk-integrity.c b/block/blk-integrity.c index bec0d1df387c..b37b8855eed1 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -254,10 +254,11 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr, if (err) return err; + /* the flags are inverted vs the values in the sysfs files */ if (val) - bi->flags |= flag; - else bi->flags &= ~flag; + else + bi->flags |= flag; return count; } @@ -266,7 +267,7 @@ static ssize_t flag_show(struct device *dev, struct device_attribute *attr, { struct blk_integrity *bi = dev_to_bi(dev); - return sysfs_emit(page, "%d\n", !!(bi->flags & flag)); + return sysfs_emit(page, "%d\n", !(bi->flags & flag)); } static ssize_t format_show(struct device *dev, struct device_attribute *attr, @@ -301,26 +302,26 @@ static ssize_t read_verify_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - return flag_store(dev, attr, page, count, BLK_INTEGRITY_VERIFY); + return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOVERIFY); } static ssize_t read_verify_show(struct device *dev, struct device_attribute *attr, char *page) { - return flag_show(dev, attr, page, BLK_INTEGRITY_VERIFY); + return flag_show(dev, attr, page, BLK_INTEGRITY_NOVERIFY); } static ssize_t write_generate_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count) { - return flag_store(dev, attr, page, count, BLK_INTEGRITY_GENERATE); + return flag_store(dev, attr, page, count, BLK_INTEGRITY_NOGENERATE); } static ssize_t write_generate_show(struct device *dev, struct device_attribute *attr, char *page) { - return flag_show(dev, attr, page, BLK_INTEGRITY_GENERATE); + return flag_show(dev, attr, page, BLK_INTEGRITY_NOGENERATE); } static ssize_t device_is_integrity_capable_show(struct device *dev, @@ -371,8 +372,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template struct blk_integrity *bi = &disk->queue->integrity; bi->csum_type = template->csum_type; - bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | - template->flags; + bi->flags = template->flags; bi->interval_exp = template->interval_exp ? : ilog2(queue_logical_block_size(disk->queue)); bi->tuple_size = template->tuple_size; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 56ce1ae35580..bafa01d4e7f9 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -7,8 +7,8 @@ struct request; enum blk_integrity_flags { - BLK_INTEGRITY_VERIFY = 1 << 0, - BLK_INTEGRITY_GENERATE = 1 << 1, + BLK_INTEGRITY_NOVERIFY = 1 << 0, + BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, }; From c6e56cf6b2e79a463af21286ba951714ed20828c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jun 2024 10:48:22 +0200 Subject: [PATCH 26/26] block: move integrity information into queue_limits Move the integrity information into the queue limits so that it can be set atomically with other queue limits, and that the sysfs changes to the read_verify and write_generate flags are properly synchronized. This also allows to provide a more useful helper to stack the integrity fields, although it still is separate from the main stacking function as not all stackable devices want to inherit the integrity settings. Even with that it greatly simplifies the code in md and dm. Note that the integrity field is moved as-is into the queue limits. While there are good arguments for removing the separate blk_integrity structure, this would cause a lot of churn and might better be done at a later time if desired. However the integrity field in the queue_limits structure is now unconditional so that various ifdefs can be avoided or replaced with IS_ENABLED(). Given that tiny size of it that seems like a worthwhile trade off. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240613084839.1044015-13-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/data-integrity.rst | 49 +------- block/blk-integrity.c | 124 ++----------------- block/blk-settings.c | 118 +++++++++++++++++- block/t10-pi.c | 12 +- drivers/md/dm-core.h | 1 - drivers/md/dm-integrity.c | 27 ++--- drivers/md/dm-table.c | 161 +++++-------------------- drivers/md/md.c | 72 +++-------- drivers/md/md.h | 5 +- drivers/md/raid0.c | 7 +- drivers/md/raid1.c | 10 +- drivers/md/raid10.c | 10 +- drivers/md/raid5.c | 2 +- drivers/nvdimm/btt.c | 13 +- drivers/nvme/host/core.c | 70 +++++------ drivers/scsi/sd.c | 8 +- drivers/scsi/sd.h | 12 +- drivers/scsi/sd_dif.c | 34 +++--- include/linux/blk-integrity.h | 27 ++--- include/linux/blkdev.h | 12 +- include/linux/t10-pi.h | 12 +- 21 files changed, 289 insertions(+), 497 deletions(-) diff --git a/Documentation/block/data-integrity.rst b/Documentation/block/data-integrity.rst index 6a760c0eb192..99905e880a0e 100644 --- a/Documentation/block/data-integrity.rst +++ b/Documentation/block/data-integrity.rst @@ -153,18 +153,11 @@ bio_free() will automatically free the bip. 4.2 Block Device ---------------- -Because the format of the protection data is tied to the physical -disk, each block device has been extended with a block integrity -profile (struct blk_integrity). This optional profile is registered -with the block layer using blk_integrity_register(). - -The profile contains callback functions for generating and verifying -the protection data, as well as getting and setting application tags. -The profile also contains a few constants to aid in completing, -merging and splitting the integrity metadata. +Block devices can set up the integrity information in the integrity +sub-struture of the queue_limits structure. Layered block devices will need to pick a profile that's appropriate -for all subdevices. blk_integrity_compare() can help with that. DM +for all subdevices. queue_limits_stack_integrity() can help with that. DM and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6 will require extra work due to the application tag. @@ -250,42 +243,6 @@ will require extra work due to the application tag. integrity upon completion. -5.4 Registering A Block Device As Capable Of Exchanging Integrity Metadata --------------------------------------------------------------------------- - - To enable integrity exchange on a block device the gendisk must be - registered as capable: - - `int blk_integrity_register(gendisk, blk_integrity);` - - The blk_integrity struct is a template and should contain the - following:: - - static struct blk_integrity my_profile = { - .name = "STANDARDSBODY-TYPE-VARIANT-CSUM", - .generate_fn = my_generate_fn, - .verify_fn = my_verify_fn, - .tuple_size = sizeof(struct my_tuple_size), - .tag_size = , - }; - - 'name' is a text string which will be visible in sysfs. This is - part of the userland API so chose it carefully and never change - it. The format is standards body-type-variant. - E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC. - - 'generate_fn' generates appropriate integrity metadata (for WRITE). - - 'verify_fn' verifies that the data buffer matches the integrity - metadata. - - 'tuple_size' must be set to match the size of the integrity - metadata per sector. I.e. 8 for DIF and EPP. - - 'tag_size' must be set to identify how many bytes of tag space - are available per hardware sector. For DIF this is either 2 or - 0 depending on the value of the Control Mode Page ATO bit. - ---------------------------------------------------------------------- 2007-12-24 Martin K. Petersen diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b37b8855eed1..05a48689a424 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -107,63 +107,6 @@ new_segment: } EXPORT_SYMBOL(blk_rq_map_integrity_sg); -/** - * blk_integrity_compare - Compare integrity profile of two disks - * @gd1: Disk to compare - * @gd2: Disk to compare - * - * Description: Meta-devices like DM and MD need to verify that all - * sub-devices use the same integrity format before advertising to - * upper layers that they can send/receive integrity metadata. This - * function can be used to check whether two gendisk devices have - * compatible integrity formats. - */ -int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) -{ - struct blk_integrity *b1 = &gd1->queue->integrity; - struct blk_integrity *b2 = &gd2->queue->integrity; - - if (!b1->tuple_size && !b2->tuple_size) - return 0; - - if (!b1->tuple_size || !b2->tuple_size) - return -1; - - if (b1->interval_exp != b2->interval_exp) { - pr_err("%s: %s/%s protection interval %u != %u\n", - __func__, gd1->disk_name, gd2->disk_name, - 1 << b1->interval_exp, 1 << b2->interval_exp); - return -1; - } - - if (b1->tuple_size != b2->tuple_size) { - pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tuple_size, b2->tuple_size); - return -1; - } - - if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - pr_err("%s: %s/%s tag sz %u != %u\n", __func__, - gd1->disk_name, gd2->disk_name, - b1->tag_size, b2->tag_size); - return -1; - } - - if (b1->csum_type != b2->csum_type || - (b1->flags & BLK_INTEGRITY_REF_TAG) != - (b2->flags & BLK_INTEGRITY_REF_TAG)) { - pr_err("%s: %s/%s type %s != %s\n", __func__, - gd1->disk_name, gd2->disk_name, - blk_integrity_profile_name(b1), - blk_integrity_profile_name(b2)); - return -1; - } - - return 0; -} -EXPORT_SYMBOL(blk_integrity_compare); - bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) { @@ -217,7 +160,7 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, static inline struct blk_integrity *dev_to_bi(struct device *dev) { - return &dev_to_disk(dev)->queue->integrity; + return &dev_to_disk(dev)->queue->limits.integrity; } const char *blk_integrity_profile_name(struct blk_integrity *bi) @@ -246,7 +189,8 @@ EXPORT_SYMBOL_GPL(blk_integrity_profile_name); static ssize_t flag_store(struct device *dev, struct device_attribute *attr, const char *page, size_t count, unsigned char flag) { - struct blk_integrity *bi = dev_to_bi(dev); + struct request_queue *q = dev_to_disk(dev)->queue; + struct queue_limits lim; unsigned long val; int err; @@ -254,11 +198,18 @@ static ssize_t flag_store(struct device *dev, struct device_attribute *attr, if (err) return err; - /* the flags are inverted vs the values in the sysfs files */ + /* note that the flags are inverted vs the values in the sysfs files */ + lim = queue_limits_start_update(q); if (val) - bi->flags &= ~flag; + lim.integrity.flags &= ~flag; else - bi->flags |= flag; + lim.integrity.flags |= flag; + + blk_mq_freeze_queue(q); + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + if (err) + return err; return count; } @@ -355,52 +306,3 @@ const struct attribute_group blk_integrity_attr_group = { .name = "integrity", .attrs = integrity_attrs, }; - -/** - * blk_integrity_register - Register a gendisk as being integrity-capable - * @disk: struct gendisk pointer to make integrity-aware - * @template: block integrity profile to register - * - * Description: When a device needs to advertise itself as being able to - * send/receive integrity metadata it must use this function to register - * the capability with the block layer. The template is a blk_integrity - * struct with values appropriate for the underlying hardware. See - * Documentation/block/data-integrity.rst. - */ -void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - bi->csum_type = template->csum_type; - bi->flags = template->flags; - bi->interval_exp = template->interval_exp ? : - ilog2(queue_logical_block_size(disk->queue)); - bi->tuple_size = template->tuple_size; - bi->tag_size = template->tag_size; - bi->pi_offset = template->pi_offset; - -#ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->crypto_profile) { - pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - disk->queue->crypto_profile = NULL; - } -#endif -} -EXPORT_SYMBOL(blk_integrity_register); - -/** - * blk_integrity_unregister - Unregister block integrity profile - * @disk: disk whose integrity profile to unregister - * - * Description: This function unregisters the integrity capability from - * a block device. - */ -void blk_integrity_unregister(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!bi->tuple_size) - return; - memset(bi, 0, sizeof(*bi)); -} -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-settings.c b/block/blk-settings.c index 996f247fc98e..f11c8676eb4c 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -97,6 +97,36 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } +static int blk_validate_integrity_limits(struct queue_limits *lim) +{ + struct blk_integrity *bi = &lim->integrity; + + if (!bi->tuple_size) { + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE || + bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) { + pr_warn("invalid PI settings.\n"); + return -EINVAL; + } + return 0; + } + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { + pr_warn("integrity support disabled.\n"); + return -EINVAL; + } + + if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE && + (bi->flags & BLK_INTEGRITY_REF_TAG)) { + pr_warn("ref tag not support without checksum.\n"); + return -EINVAL; + } + + if (!bi->interval_exp) + bi->interval_exp = ilog2(lim->logical_block_size); + + return 0; +} + /* * Check that the limits in lim are valid, initialize defaults for unset * values, and cap values based on others where needed. @@ -105,6 +135,7 @@ static int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; + int err; /* * Unless otherwise specified, default to 512 byte logical blocks and a @@ -230,6 +261,9 @@ static int blk_validate_limits(struct queue_limits *lim) lim->misaligned = 0; } + err = blk_validate_integrity_limits(lim); + if (err) + return err; return blk_validate_zoned_limits(lim); } @@ -263,13 +297,24 @@ int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim) __releases(q->limits_lock) { - int error = blk_validate_limits(lim); + int error; - if (!error) { - q->limits = *lim; - if (q->disk) - blk_apply_bdi_limits(q->disk->bdi, lim); + error = blk_validate_limits(lim); + if (error) + goto out_unlock; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (q->crypto_profile && lim->integrity.tag_size) { + pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together.\n"); + error = -EINVAL; + goto out_unlock; } +#endif + + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); +out_unlock: mutex_unlock(&q->limits_lock); return error; } @@ -575,6 +620,67 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, } EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); +/** + * queue_limits_stack_integrity - stack integrity profile + * @t: target queue limits + * @b: base queue limits + * + * Check if the integrity profile in the @b can be stacked into the + * target @t. Stacking is possible if either: + * + * a) does not have any integrity information stacked into it yet + * b) the integrity profile in @b is identical to the one in @t + * + * If @b can be stacked into @t, return %true. Else return %false and clear the + * integrity information in @t. + */ +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b) +{ + struct blk_integrity *ti = &t->integrity; + struct blk_integrity *bi = &b->integrity; + + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + return true; + + if (!ti->tuple_size) { + /* inherit the settings from the first underlying device */ + if (!(ti->flags & BLK_INTEGRITY_STACKED)) { + ti->flags = BLK_INTEGRITY_DEVICE_CAPABLE | + (bi->flags & BLK_INTEGRITY_REF_TAG); + ti->csum_type = bi->csum_type; + ti->tuple_size = bi->tuple_size; + ti->pi_offset = bi->pi_offset; + ti->interval_exp = bi->interval_exp; + ti->tag_size = bi->tag_size; + goto done; + } + if (!bi->tuple_size) + goto done; + } + + if (ti->tuple_size != bi->tuple_size) + goto incompatible; + if (ti->interval_exp != bi->interval_exp) + goto incompatible; + if (ti->tag_size != bi->tag_size) + goto incompatible; + if (ti->csum_type != bi->csum_type) + goto incompatible; + if ((ti->flags & BLK_INTEGRITY_REF_TAG) != + (bi->flags & BLK_INTEGRITY_REF_TAG)) + goto incompatible; + +done: + ti->flags |= BLK_INTEGRITY_STACKED; + return true; + +incompatible: + memset(ti, 0, sizeof(*ti)); + return false; +} +EXPORT_SYMBOL_GPL(queue_limits_stack_integrity); + /** * blk_queue_update_dma_pad - update pad mask * @q: the request queue for the device diff --git a/block/t10-pi.c b/block/t10-pi.c index dadecf621497..cd7fa60d63ff 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -116,7 +116,7 @@ next: */ static void t10_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -167,7 +167,7 @@ static void t10_pi_type1_prepare(struct request *rq) */ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); @@ -290,7 +290,7 @@ next: static void ext_pi_type1_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); u8 offset = bi->pi_offset; @@ -330,7 +330,7 @@ static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; unsigned intervals = nr_bytes >> bi->interval_exp; const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); @@ -396,7 +396,7 @@ blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, void blk_integrity_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; @@ -409,7 +409,7 @@ void blk_integrity_prepare(struct request *rq) void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->integrity; + struct blk_integrity *bi = &rq->q->limits.integrity; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 08700bfc3e23..14a44c0f8286 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -206,7 +206,6 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; - unsigned integrity_added:1; /* * Indicates the rw permissions for the new logical device. This diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c1cc27541673..2a89f8eb4713 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3475,6 +3475,17 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim limits->dma_alignment = limits->logical_block_size - 1; limits->discard_granularity = ic->sectors_per_block << SECTOR_SHIFT; } + + if (!ic->internal_hash) { + struct blk_integrity *bi = &limits->integrity; + + memset(bi, 0, sizeof(*bi)); + bi->tuple_size = ic->tag_size; + bi->tag_size = bi->tuple_size; + bi->interval_exp = + ic->sb->log2_sectors_per_block + SECTOR_SHIFT; + } + limits->max_integrity_segments = USHRT_MAX; } @@ -3631,19 +3642,6 @@ try_smaller_buffer: return 0; } -static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) -{ - struct gendisk *disk = dm_disk(dm_table_get_md(ti->table)); - struct blk_integrity bi; - - memset(&bi, 0, sizeof(bi)); - bi.tuple_size = ic->tag_size; - bi.tag_size = bi.tuple_size; - bi.interval_exp = ic->sb->log2_sectors_per_block + SECTOR_SHIFT; - - blk_integrity_register(disk, &bi); -} - static void dm_integrity_free_page_list(struct page_list *pl) { unsigned int i; @@ -4629,9 +4627,6 @@ try_smaller_buffer: } } - if (!ic->internal_hash) - dm_integrity_set(ti, ic); - ti->num_flush_bios = 1; ti->flush_supported = true; if (ic->discard) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b2d5246cff21..fd789eeb62d9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -425,6 +425,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, q->limits.logical_block_size, q->limits.alignment_offset, (unsigned long long) start << SECTOR_SHIFT); + + /* + * Only stack the integrity profile if the target doesn't have native + * integrity support. + */ + if (!dm_target_has_integrity(ti->type)) + queue_limits_stack_integrity_bdev(limits, bdev); return 0; } @@ -702,9 +709,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->immutable_target_type = ti->type; } - if (dm_target_has_integrity(ti->type)) - t->integrity_added = 1; - ti->table = t; ti->begin = start; ti->len = len; @@ -1119,99 +1123,6 @@ static int dm_table_build_index(struct dm_table *t) return r; } -static bool integrity_profile_exists(struct gendisk *disk) -{ - return !!blk_get_integrity(disk); -} - -/* - * Get a disk whose integrity profile reflects the table's profile. - * Returns NULL if integrity support was inconsistent or unavailable. - */ -static struct gendisk *dm_table_get_integrity_disk(struct dm_table *t) -{ - struct list_head *devices = dm_table_get_devices(t); - struct dm_dev_internal *dd = NULL; - struct gendisk *prev_disk = NULL, *template_disk = NULL; - - for (unsigned int i = 0; i < t->num_targets; i++) { - struct dm_target *ti = dm_table_get_target(t, i); - - if (!dm_target_passes_integrity(ti->type)) - goto no_integrity; - } - - list_for_each_entry(dd, devices, list) { - template_disk = dd->dm_dev->bdev->bd_disk; - if (!integrity_profile_exists(template_disk)) - goto no_integrity; - else if (prev_disk && - blk_integrity_compare(prev_disk, template_disk) < 0) - goto no_integrity; - prev_disk = template_disk; - } - - return template_disk; - -no_integrity: - if (prev_disk) - DMWARN("%s: integrity not set: %s and %s profile mismatch", - dm_device_name(t->md), - prev_disk->disk_name, - template_disk->disk_name); - return NULL; -} - -/* - * Register the mapped device for blk_integrity support if the - * underlying devices have an integrity profile. But all devices may - * not have matching profiles (checking all devices isn't reliable - * during table load because this table may use other DM device(s) which - * must be resumed before they will have an initialized integity - * profile). Consequently, stacked DM devices force a 2 stage integrity - * profile validation: First pass during table load, final pass during - * resume. - */ -static int dm_table_register_integrity(struct dm_table *t) -{ - struct mapped_device *md = t->md; - struct gendisk *template_disk = NULL; - - /* If target handles integrity itself do not register it here. */ - if (t->integrity_added) - return 0; - - template_disk = dm_table_get_integrity_disk(t); - if (!template_disk) - return 0; - - if (!integrity_profile_exists(dm_disk(md))) { - t->integrity_supported = true; - /* - * Register integrity profile during table load; we can do - * this because the final profile must match during resume. - */ - blk_integrity_register(dm_disk(md), - blk_get_integrity(template_disk)); - return 0; - } - - /* - * If DM device already has an initialized integrity - * profile the new profile should not conflict. - */ - if (blk_integrity_compare(dm_disk(md), template_disk) < 0) { - DMERR("%s: conflict with existing integrity profile: %s profile mismatch", - dm_device_name(t->md), - template_disk->disk_name); - return 1; - } - - /* Preserve existing integrity profile */ - t->integrity_supported = true; - return 0; -} - #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct dm_crypto_profile { @@ -1423,12 +1334,6 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_register_integrity(t); - if (r) { - DMERR("could not register integrity profile."); - return r; - } - r = dm_table_construct_crypto_profile(t); if (r) { DMERR("could not construct crypto profile."); @@ -1688,6 +1593,14 @@ int dm_calculate_queue_limits(struct dm_table *t, blk_set_stacking_limits(limits); + t->integrity_supported = true; + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_passes_integrity(ti->type)) + t->integrity_supported = false; + } + for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1738,6 +1651,18 @@ combine_limits: dm_device_name(t->md), (unsigned long long) ti->begin, (unsigned long long) ti->len); + + if (t->integrity_supported || + dm_target_has_integrity(ti->type)) { + if (!queue_limits_stack_integrity(limits, &ti_limits)) { + DMWARN("%s: adding target device (start sect %llu len %llu) " + "disabled integrity support due to incompatibility", + dm_device_name(t->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + t->integrity_supported = false; + } + } } /* @@ -1761,36 +1686,6 @@ combine_limits: return validate_hardware_logical_block_alignment(t, limits); } -/* - * Verify that all devices have an integrity profile that matches the - * DM device's registered integrity profile. If the profiles don't - * match then unregister the DM device's integrity profile. - */ -static void dm_table_verify_integrity(struct dm_table *t) -{ - struct gendisk *template_disk = NULL; - - if (t->integrity_added) - return; - - if (t->integrity_supported) { - /* - * Verify that the original integrity profile - * matches all the devices in this table. - */ - template_disk = dm_table_get_integrity_disk(t); - if (template_disk && - blk_integrity_compare(dm_disk(t->md), template_disk) >= 0) - return; - } - - if (integrity_profile_exists(dm_disk(t->md))) { - DMWARN("%s: unable to establish an integrity profile", - dm_device_name(t->md)); - blk_integrity_unregister(dm_disk(t->md)); - } -} - static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -2004,8 +1899,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - dm_table_verify_integrity(t); - /* * Some devices don't use blk_integrity but still want stable pages * because they do their own checksumming. diff --git a/drivers/md/md.c b/drivers/md/md.c index aff9118ff697..67ece2cd725f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2410,36 +2410,10 @@ static LIST_HEAD(pending_raid_disks); */ int md_integrity_register(struct mddev *mddev) { - struct md_rdev *rdev, *reference = NULL; - if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) - return 0; /* shouldn't register, or already is */ - rdev_for_each(rdev, mddev) { - /* skip spares and non-functional disks */ - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->raid_disk < 0) - continue; - if (!reference) { - /* Use the first rdev as the reference */ - reference = rdev; - continue; - } - /* does this rdev's profile match the reference profile? */ - if (blk_integrity_compare(reference->bdev->bd_disk, - rdev->bdev->bd_disk) < 0) - return -EINVAL; - } - if (!reference || !bdev_get_integrity(reference->bdev)) - return 0; - /* - * All component devices are integrity capable and have matching - * profiles, register the common profile for the md device. - */ - blk_integrity_register(mddev->gendisk, - bdev_get_integrity(reference->bdev)); + if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || @@ -2459,32 +2433,6 @@ int md_integrity_register(struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_register); -/* - * Attempt to add an rdev, but only if it is consistent with the current - * integrity profile - */ -int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - struct blk_integrity *bi_mddev; - - if (mddev_is_dm(mddev)) - return 0; - - bi_mddev = blk_get_integrity(mddev->gendisk); - - if (!bi_mddev) /* nothing to do */ - return 0; - - if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { - pr_err("%s: incompatible integrity profile for %pg\n", - mdname(mddev), rdev->bdev); - return -ENXIO; - } - - return 0; -} -EXPORT_SYMBOL(md_integrity_add_rdev); - static bool rdev_read_only(struct md_rdev *rdev) { return bdev_read_only(rdev->bdev) || @@ -5755,14 +5703,20 @@ static const struct kobj_type md_ktype = { int mdp_major = 0; /* stack the limit for all rdevs into lim */ -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags) { struct md_rdev *rdev; rdev_for_each(rdev, mddev) { queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + if ((flags & MDDEV_STACK_INTEGRITY) && + !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) + return -EINVAL; } + + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -5777,6 +5731,14 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) lim = queue_limits_start_update(mddev->gendisk->queue); queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, mddev->gendisk->disk_name); + + if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { + pr_err("%s: incompatible integrity profile for %pg\n", + mdname(mddev), rdev->bdev); + queue_limits_cancel_update(mddev->gendisk->queue); + return -ENXIO; + } + return queue_limits_commit_update(mddev->gendisk->queue, &lim); } EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index ca085ecad504..6733b0b0abf9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -809,7 +809,6 @@ extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); extern int md_integrity_register(struct mddev *mddev); -extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int mddev_init(struct mddev *mddev); @@ -908,7 +907,9 @@ void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); -void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); +#define MDDEV_STACK_INTEGRITY (1u << 0) +int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, + unsigned int flags); int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 81c01347cd24..62634e2a33bd 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -377,13 +377,18 @@ static void raid0_free(struct mddev *mddev, void *priv) static int raid0_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; blk_set_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1f321826ef02..779cad62f6f8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1907,9 +1907,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -3197,10 +3194,15 @@ static struct r1conf *setup_conf(struct mddev *mddev) static int raid1_set_limits(struct mddev *mddev) { struct queue_limits lim; + int err; blk_set_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a4556d2e46bf..5f6885b53b69 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2083,9 +2083,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; - if (md_integrity_add_rdev(rdev, mddev)) - return -ENXIO; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -3980,12 +3977,17 @@ static int raid10_set_queue_limits(struct mddev *mddev) { struct r10conf *conf = mddev->private; struct queue_limits lim; + int err; blk_set_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); - mddev_stack_rdev_limits(mddev, &lim); + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2bd1ce9b3922..675c68fa6c64 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7708,7 +7708,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.raid_partial_stripes_expensive = 1; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; - mddev_stack_rdev_limits(mddev, &lim); + mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, mddev->gendisk->disk_name); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 1e5aedaf8c7b..c5f8451b494d 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1504,6 +1504,11 @@ static int btt_blk_init(struct btt *btt) }; int rc; + if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { + lim.integrity.tuple_size = btt_meta_size(btt); + lim.integrity.tag_size = btt_meta_size(btt); + } + btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(btt->btt_disk)) return PTR_ERR(btt->btt_disk); @@ -1516,14 +1521,6 @@ static int btt_blk_init(struct btt *btt) blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); - if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) { - struct blk_integrity bi = { - .tuple_size = btt_meta_size(btt), - .tag_size = btt_meta_size(btt), - }; - blk_integrity_register(btt->btt_disk, &bi); - } - set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL); if (rc) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 14bac248cde4..5a673fa5cb26 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1723,11 +1723,12 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) +static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head, + struct queue_limits *lim) { - struct blk_integrity integrity = { }; + struct blk_integrity *bi = &lim->integrity; - blk_integrity_unregister(disk); + memset(bi, 0, sizeof(*bi)); if (!head->ms) return true; @@ -1744,14 +1745,14 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE3: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; - integrity.tag_size = sizeof(u16) + sizeof(u32); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->tag_size = sizeof(u16) + sizeof(u32); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; case NVME_NVM_NS_64B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; - integrity.tag_size = sizeof(u16) + 6; - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC64; + bi->tag_size = sizeof(u16) + 6; + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; break; default: break; @@ -1761,16 +1762,16 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) case NVME_NS_DPS_PI_TYPE2: switch (head->guard_type) { case NVME_NVM_NS_16B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC; - integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | - BLK_INTEGRITY_REF_TAG; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->tag_size = sizeof(u16); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; case NVME_NVM_NS_64B_GUARD: - integrity.csum_type = BLK_INTEGRITY_CSUM_CRC64; - integrity.tag_size = sizeof(u16); - integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE | - BLK_INTEGRITY_REF_TAG; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC64; + bi->tag_size = sizeof(u16); + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE | + BLK_INTEGRITY_REF_TAG; break; default: break; @@ -1780,9 +1781,8 @@ static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) break; } - integrity.tuple_size = head->ms; - integrity.pi_offset = head->pi_offset; - blk_integrity_register(disk, &integrity); + bi->tuple_size = head->ms; + bi->pi_offset = head->pi_offset; return true; } @@ -2105,11 +2105,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); - ret = queue_limits_commit_update(ns->disk->queue, &lim); - if (ret) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } /* * Register a metadata profile for PI, or the plain non-integrity NVMe @@ -2117,9 +2112,15 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, * I/O to namespaces with metadata except when the namespace supports * PI, as it can strip/insert in that case. */ - if (!nvme_init_integrity(ns->disk, ns->head)) + if (!nvme_init_integrity(ns->disk, ns->head, &lim)) capacity = 0; + ret = queue_limits_commit_update(ns->disk->queue, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } + set_capacity_and_notify(ns->disk, capacity); /* @@ -2191,14 +2192,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) struct queue_limits lim; blk_mq_freeze_queue(ns->head->disk->queue); - if (unsupported) - ns->head->disk->flags |= GENHD_FL_HIDDEN; - else - nvme_init_integrity(ns->head->disk, ns->head); - set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); - /* * queue_limits mixes values that are the hardware limitations * for bio splitting with what is the device configuration. @@ -2221,7 +2214,16 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) lim.io_opt = ns_lim->io_opt; queue_limits_stack_bdev(&lim, ns->disk->part0, 0, ns->head->disk->disk_name); + if (unsupported) + ns->head->disk->flags |= GENHD_FL_HIDDEN; + else + nvme_init_integrity(ns->head->disk, ns->head, &lim); ret = queue_limits_commit_update(ns->head->disk->queue, &lim); + + set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); + set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); + nvme_mpath_revalidate_paths(ns); + blk_mq_unfreeze_queue(ns->head->disk->queue); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d957e29b17a9..e01393ed4207 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2482,11 +2482,13 @@ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer return 0; } -static void sd_config_protection(struct scsi_disk *sdkp) +static void sd_config_protection(struct scsi_disk *sdkp, + struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; - sd_dif_config_host(sdkp); + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) + sd_dif_config_host(sdkp, lim); if (!sdkp->protection_type) return; @@ -3677,7 +3679,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); - sd_config_protection(sdkp); + sd_config_protection(sdkp, &lim); } /* diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index b4170b17bad4..726f1613f6cb 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -220,17 +220,7 @@ static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sec return sector >> (ilog2(sdev->sector_size) - 9); } -#ifdef CONFIG_BLK_DEV_INTEGRITY - -extern void sd_dif_config_host(struct scsi_disk *); - -#else /* CONFIG_BLK_DEV_INTEGRITY */ - -static inline void sd_dif_config_host(struct scsi_disk *disk) -{ -} - -#endif /* CONFIG_BLK_DEV_INTEGRITY */ +void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim); static inline int sd_is_zoned(struct scsi_disk *sdkp) { diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 6f0921c7db78..ae6ce6f5d622 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -24,14 +24,15 @@ /* * Configure exchange of protection information between OS and HBA. */ -void sd_dif_config_host(struct scsi_disk *sdkp) +void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; - struct gendisk *disk = sdkp->disk; u8 type = sdkp->protection_type; - struct blk_integrity bi; + struct blk_integrity *bi = &lim->integrity; int dif, dix; + memset(bi, 0, sizeof(*bi)); + dif = scsi_host_dif_capable(sdp->host, type); dix = scsi_host_dix_capable(sdp->host, type); @@ -39,40 +40,33 @@ void sd_dif_config_host(struct scsi_disk *sdkp) dif = 0; dix = 1; } - if (!dix) { - blk_integrity_unregister(disk); + if (!dix) return; - } - - memset(&bi, 0, sizeof(bi)); /* Enable DMA of protection information */ if (scsi_host_get_guard(sdkp->device->host) & SHOST_DIX_GUARD_IP) - bi.csum_type = BLK_INTEGRITY_CSUM_IP; + bi->csum_type = BLK_INTEGRITY_CSUM_IP; else - bi.csum_type = BLK_INTEGRITY_CSUM_CRC; + bi->csum_type = BLK_INTEGRITY_CSUM_CRC; if (type != T10_PI_TYPE3_PROTECTION) - bi.flags |= BLK_INTEGRITY_REF_TAG; + bi->flags |= BLK_INTEGRITY_REF_TAG; - bi.tuple_size = sizeof(struct t10_pi_tuple); + bi->tuple_size = sizeof(struct t10_pi_tuple); if (dif && type) { - bi.flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE; if (!sdkp->ATO) - goto out; + return; if (type == T10_PI_TYPE3_PROTECTION) - bi.tag_size = sizeof(u16) + sizeof(u32); + bi->tag_size = sizeof(u16) + sizeof(u32); else - bi.tag_size = sizeof(u16); + bi->tag_size = sizeof(u16); } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIX %s, application tag size %u bytes\n", - blk_integrity_profile_name(&bi), bi.tag_size); -out: - blk_integrity_register(disk, &bi); + blk_integrity_profile_name(bi), bi->tag_size); } - diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index bafa01d4e7f9..d201140d77a3 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -11,6 +11,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_NOGENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, + BLK_INTEGRITY_STACKED = 1 << 4, }; struct blk_integrity_iter { @@ -23,11 +24,15 @@ struct blk_integrity_iter { }; const char *blk_integrity_profile_name(struct blk_integrity *bi); +bool queue_limits_stack_integrity(struct queue_limits *t, + struct queue_limits *b); +static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, + struct block_device *bdev) +{ + return queue_limits_stack_integrity(t, &bdev->bd_disk->queue->limits); +} #ifdef CONFIG_BLK_DEV_INTEGRITY -void blk_integrity_register(struct gendisk *, struct blk_integrity *); -void blk_integrity_unregister(struct gendisk *); -int blk_integrity_compare(struct gendisk *, struct gendisk *); int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); @@ -35,14 +40,14 @@ int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { - return q->integrity.tuple_size; + return q->limits.integrity.tuple_size; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { if (!blk_integrity_queue_supports_integrity(disk->queue)) return NULL; - return &disk->queue->integrity; + return &disk->queue->limits.integrity; } static inline struct blk_integrity * @@ -119,17 +124,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) { return false; } -static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) -{ - return 0; -} -static inline void blk_integrity_register(struct gendisk *d, - struct blk_integrity *b) -{ -} -static inline void blk_integrity_unregister(struct gendisk *d) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -157,4 +151,5 @@ static inline struct bio_vec *rq_integrity_vec(struct request *rq) return NULL; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* _LINUX_BLK_INTEGRITY_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9089750919c..0c247a716885 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -334,6 +334,8 @@ struct queue_limits { * due to possible offsets. */ unsigned int dma_alignment; + + struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, @@ -419,10 +421,6 @@ struct request_queue { struct queue_limits limits; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct blk_integrity integrity; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ - #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; @@ -1300,11 +1298,9 @@ static inline bool bdev_stable_writes(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -#ifdef CONFIG_BLK_DEV_INTEGRITY - /* BLK_INTEGRITY_CSUM_NONE is not available in blkdev.h */ - if (q->integrity.csum_type != 0) + if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) return true; -#endif return test_bit(QUEUE_FLAG_STABLE_WRITES, &q->queue_flags); } diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index d2bafb76badf..1773610010eb 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -39,12 +39,8 @@ struct t10_pi_tuple { static inline u32 t10_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + unsigned int shift = rq->q->limits.integrity.interval_exp; -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif return blk_rq_pos(rq) >> (shift - SECTOR_SHIFT) & 0xffffffff; } @@ -65,12 +61,8 @@ static inline u64 lower_48_bits(u64 n) static inline u64 ext_pi_ref_tag(struct request *rq) { - unsigned int shift = ilog2(queue_logical_block_size(rq->q)); + unsigned int shift = rq->q->limits.integrity.interval_exp; -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (rq->q->integrity.interval_exp) - shift = rq->q->integrity.interval_exp; -#endif return lower_48_bits(blk_rq_pos(rq) >> (shift - SECTOR_SHIFT)); }