Merge tag 'nvme-6.16-2025-06-05' of git://git.infradead.org/nvme into block-6.16

Pull NVMe updates and fixes from Christoph:

"nvme updates for Linux 6.16

 - TCP error handling fix (Shin'ichiro Kawasaki)
 - TCP I/O stall handling fixes (Hannes Reinecke)
 - fix command limits status code (Keith Busch)
 - support vectored buffers also for passthrough (Pavel Begunkov)
 - spelling fixes (Yi Zhang)"

* tag 'nvme-6.16-2025-06-05' of git://git.infradead.org/nvme:
  nvme: spelling fixes
  nvme-tcp: fix I/O stalls on congested sockets
  nvme-tcp: sanitize request list handling
  nvme-tcp: remove tag set when second admin queue config fails
  nvme: enable vectored registered bufs for passthrough cmds
  nvme: fix implicit bool to flags conversion
  nvme: fix command limits status code
Jens Axboe committed 2025-06-05 07:40:38 -06:00
20 changed files with 57 additions and 54 deletions

drivers/nvme/common/auth.c (+3 -3)
@@ -471,7 +471,7 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
  * @c1: Value of challenge C1
  * @c2: Value of challenge C2
  * @hash_len: Hash length of the hash algorithm
- * @ret_psk: Pointer too the resulting generated PSK
+ * @ret_psk: Pointer to the resulting generated PSK
  * @ret_len: length of @ret_psk
  *
  * Generate a PSK for TLS as specified in NVMe base specification, section
@@ -759,8 +759,8 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len,
 		goto out_free_prk;
 	/*
-	 * 2 addtional bytes for the length field from HDKF-Expand-Label,
-	 * 2 addtional bytes for the HMAC ID, and one byte for the space
+	 * 2 additional bytes for the length field from HDKF-Expand-Label,
+	 * 2 additional bytes for the HMAC ID, and one byte for the space
 	 * separator.
 	 */
 	info_len = strlen(psk_digest) + strlen(psk_prefix) + 5;
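The arithmetic in that comment corresponds to an info string built from the prefix, a two-character HMAC identifier, a space separator, and the digest, with HKDF-Expand-Label prepending a two-byte length field. A standalone sketch of the same calculation (layout assumed from the comment, not checked against the spec):

#include <stddef.h>
#include <string.h>

static size_t tls_psk_info_len(const char *psk_prefix, const char *psk_digest)
{
	return strlen(psk_prefix) + strlen(psk_digest)
		+ 2	/* length field from HKDF-Expand-Label */
		+ 2	/* HMAC ID */
		+ 1;	/* space separator */
}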
drivers/nvme/host/Kconfig (+1 -1)
@@ -106,7 +106,7 @@ config NVME_TCP_TLS
 	help
 	  Enables TLS encryption for NVMe TCP using the netlink handshake API.
-	  The TLS handshake daemon is availble at
+	  The TLS handshake daemon is available at
 	  https://github.com/oracle/ktls-utils.
 	  If unsure, say N.
drivers/nvme/host/constants.c (+1 -1)
@@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = {
 	[NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes",
 	[NVME_SC_INVALID_PI] = "Invalid Protection Information",
 	[NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range",
-	[NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported",
+	[NVME_SC_CMD_SIZE_LIM_EXCEEDED ] = "Command Size Limits Exceeded",
 	[NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error",
 	[NVME_SC_ZONE_FULL] = "Zone Is Full",
 	[NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only",
drivers/nvme/host/core.c (+1 -2)
@@ -290,7 +290,6 @@ static blk_status_t nvme_error_status(u16 status)
 	case NVME_SC_NS_NOT_READY:
 		return BLK_STS_TARGET;
 	case NVME_SC_BAD_ATTRIBUTES:
-	case NVME_SC_ONCS_NOT_SUPPORTED:
 	case NVME_SC_INVALID_OPCODE:
 	case NVME_SC_INVALID_FIELD:
 	case NVME_SC_INVALID_NS:
@@ -1027,7 +1026,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (ns->head->ms) {
 		/*
-		 * If formated with metadata, the block layer always provides a
+		 * If formatted with metadata, the block layer always provides a
 		 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
 		 * we enable the PRACT bit for protection information or set the
 		 * namespace capacity to zero to prevent any I/O.
drivers/nvme/host/fabrics.c (+1 -1)
@@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
  * Do not retry when:
  *
  * - the DNR bit is set and the specification states no further connect
- *   attempts with the same set of paramenters should be attempted.
+ *   attempts with the same set of parameters should be attempted.
  *
  * - when the authentication attempt fails, because the key was invalid.
  *   This error code is set on the host side.
drivers/nvme/host/fabrics.h (+3 -3)
@@ -80,7 +80,7 @@ enum {
  * @transport: Holds the fabric transport "technology name" (for a lack of
  * better description) that will be used by an NVMe controller
  * being added.
- * @subsysnqn: Hold the fully qualified NQN subystem name (format defined
+ * @subsysnqn: Hold the fully qualified NQN subsystem name (format defined
  * in the NVMe specification, "NVMe Qualified Names").
  * @traddr: The transport-specific TRADDR field for a port on the
  * subsystem which is adding a controller.
@@ -156,7 +156,7 @@ struct nvmf_ctrl_options {
  * @create_ctrl(): function pointer that points to a non-NVMe
  * implementation-specific fabric technology
  * that would go into starting up that fabric
- * for the purpose of conneciton to an NVMe controller
+ * for the purpose of connection to an NVMe controller
  * using that fabric technology.
  *
  * Notes:
@@ -165,7 +165,7 @@ struct nvmf_ctrl_options {
  * 2. create_ctrl() must be defined (even if it does nothing)
  * 3. struct nvmf_transport_ops must be statically allocated in the
  *    modules .bss section so that a pure module_get on @module
- *    prevents the memory from beeing freed.
+ *    prevents the memory from being freed.
  */
 struct nvmf_transport_ops {
 	struct list_head entry;
drivers/nvme/host/fc.c (+2 -2)
@@ -1955,7 +1955,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	}
 	/*
-	 * For the linux implementation, if we have an unsuccesful
+	 * For the linux implementation, if we have an unsuccessful
 	 * status, they blk-mq layer can typically be called with the
 	 * non-zero status and the content of the cqe isn't important.
 	 */
@@ -2479,7 +2479,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
 	 * writing the registers for shutdown and polling (call
 	 * nvme_disable_ctrl()). Given a bunch of i/o was potentially
 	 * just aborted and we will wait on those contexts, and given
-	 * there was no indication of how live the controlelr is on the
+	 * there was no indication of how live the controller is on the
 	 * link, don't send more io to create more contexts for the
 	 * shutdown. Let the controller fail via keepalive failure if
 	 * its still present.
drivers/nvme/host/ioctl.c (+10 -8)
@@ -493,13 +493,15 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	d.timeout_ms = READ_ONCE(cmd->timeout_ms);
 	if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
-		/* fixedbufs is only for non-vectored io */
-		if (vec)
-			return -EINVAL;
+		int ddir = nvme_is_write(&c) ? WRITE : READ;
-		ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
-			nvme_is_write(&c) ? WRITE : READ, &iter, ioucmd,
-			issue_flags);
+		if (vec)
+			ret = io_uring_cmd_import_fixed_vec(ioucmd,
+					u64_to_user_ptr(d.addr), d.data_len,
+					ddir, &iter, issue_flags);
+		else
+			ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
+					ddir, &iter, ioucmd, issue_flags);
 		if (ret < 0)
 			return ret;
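For context, userspace drives this path roughly as follows. This is a sketch, not verified against the final uAPI: it assumes a ring created with IORING_SETUP_SQE128, a buffer registered via io_uring_register_buffers(), an NVMe char device such as /dev/ng0n1, and that data_len carries the iovec count for the _VEC opcodes, as the hunk above suggests.

#include <liburing.h>
#include <linux/nvme_ioctl.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>

/* Queue a vectored NVMe read through the passthrough interface using a
 * registered (fixed) buffer; error handling elided for brevity. */
static int queue_fixed_vec_read(struct io_uring *ring, int fd,
				struct iovec *iov, unsigned int nr_iov,
				__u32 nsid, __u64 slba, __u16 nlb)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct nvme_uring_cmd *cmd;

	if (!sqe)
		return -1;

	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fd, NULL, 0, 0);
	sqe->cmd_op = NVME_URING_CMD_IO_VEC;		/* vectored passthrough */
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;	/* use a registered buffer */
	sqe->buf_index = 0;				/* index into registered buffers */

	cmd = (struct nvme_uring_cmd *)sqe->cmd;	/* needs IORING_SETUP_SQE128 */
	memset(cmd, 0, sizeof(*cmd));
	cmd->opcode = 0x02;				/* NVMe Read */
	cmd->nsid = nsid;
	cmd->addr = (__u64)(uintptr_t)iov;		/* iovec array, not a flat buffer */
	cmd->data_len = nr_iov;				/* segment count for the _VEC opcode */
	cmd->cdw10 = (__u32)slba;
	cmd->cdw11 = (__u32)(slba >> 32);
	cmd->cdw12 = nlb;				/* number of blocks, 0's based */

	return io_uring_submit(ring);
}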
@@ -521,7 +523,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (d.data_len) {
 		ret = nvme_map_user_request(req, d.addr, d.data_len,
 				nvme_to_user_ptr(d.metadata), d.metadata_len,
-				map_iter, vec);
+				map_iter, vec ? NVME_IOCTL_VEC : 0);
 		if (ret)
 			goto out_free_req;
 	}
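The map_iter change above is the "implicit bool to flags conversion" fix from the pull message: vec is a boolean, and passing it straight into a flags parameter silently becomes bit 0. In miniature (all names hypothetical):

#define MAP_FLAG_META	(1 << 0)	/* happens to equal (int)true */
#define MAP_FLAG_VEC	(1 << 1)

static int map_request(unsigned int flags)
{
	return flags & MAP_FLAG_VEC ? 1 : 0;	/* stand-in body */
}

static int submit(int vec /* really a boolean */)
{
	/* Buggy: a true 'vec' converts to 1, i.e. MAP_FLAG_META. */
	/* return map_request(vec); */

	/* Fixed: translate the boolean into the intended flag bit. */
	return map_request(vec ? MAP_FLAG_VEC : 0);
}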
@@ -727,7 +729,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
 	/*
 	 * Handle ioctls that apply to the controller instead of the namespace
-	 * seperately and drop the ns SRCU reference early. This avoids a
+	 * separately and drop the ns SRCU reference early. This avoids a
 	 * deadlock when deleting namespaces using the passthrough interface.
 	 */
 	if (is_ctrl_ioctl(cmd))
drivers/nvme/host/multipath.c (+1 -1)
@@ -760,7 +760,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	 * controller's scan_work context. If a path error occurs here, the IO
 	 * will wait until a path becomes available or all paths are torn down,
 	 * but that action also occurs within scan_work, so it would deadlock.
-	 * Defer the partion scan to a different context that does not block
+	 * Defer the partition scan to a different context that does not block
 	 * scan_work.
 	 */
 	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
drivers/nvme/host/nvme.h (+1 -1)
@@ -523,7 +523,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head)
 enum nvme_ns_features {
 	NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
 	NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
-	NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeores supported */
+	NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeroes supported */
 };
 
 struct nvme_ns {
drivers/nvme/host/pci.c (+2 -2)
@@ -3015,7 +3015,7 @@ static void nvme_reset_work(struct work_struct *work)
 		goto out;
 	/*
-	 * Freeze and update the number of I/O queues as thos might have
+	 * Freeze and update the number of I/O queues as those might have
 	 * changed. If there are no I/O queues left after this reset, keep the
 	 * controller around but remove all namespaces.
 	 */
@@ -3186,7 +3186,7 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 		/*
 		 * Exclude some Kingston NV1 and A2000 devices from
 		 * NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a
-		 * lot fo energy with s2idle sleep on some TUXEDO platforms.
+		 * lot of energy with s2idle sleep on some TUXEDO platforms.
 		 */
 		if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
 		    dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
drivers/nvme/host/pr.c (-2)
@@ -82,8 +82,6 @@ static int nvme_status_to_pr_err(int status)
 		return PR_STS_SUCCESS;
 	case NVME_SC_RESERVATION_CONFLICT:
 		return PR_STS_RESERVATION_CONFLICT;
-	case NVME_SC_ONCS_NOT_SUPPORTED:
-		return -EOPNOTSUPP;
 	case NVME_SC_BAD_ATTRIBUTES:
 	case NVME_SC_INVALID_OPCODE:
 	case NVME_SC_INVALID_FIELD:
drivers/nvme/host/rdma.c (+2 -2)
@@ -221,7 +221,7 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
 	/*
 	 * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
-	 * lifetime. It's safe, since any chage in the underlying RDMA device
+	 * lifetime. It's safe, since any change in the underlying RDMA device
 	 * will issue error recovery and queue re-creation.
 	 */
 	for (i = 0; i < ib_queue_size; i++) {
@@ -800,7 +800,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	/*
 	 * Bind the async event SQE DMA mapping to the admin queue lifetime.
-	 * It's safe, since any chage in the underlying RDMA device will issue
+	 * It's safe, since any change in the underlying RDMA device will issue
 	 * error recovery and queue re-creation.
 	 */
 	error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
drivers/nvme/host/tcp.c (+21 -3)
@@ -452,7 +452,8 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
 		return NULL;
 	}
-	list_del(&req->entry);
+	list_del_init(&req->entry);
+	init_llist_node(&req->lentry);
 	return req;
 }
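These list-handling changes establish the invariant that a request not on any list has freshly initialized nodes, which is what the new r2t check below relies on. A sketch against the kernel list APIs (sketch_req is a stand-in for the driver's request structure):

#include <linux/list.h>
#include <linux/llist.h>

struct sketch_req {
	struct list_head entry;		/* send_list membership */
	struct llist_node lentry;	/* lockless req_list membership */
};

/*
 * Only meaningful because every dequeue path re-initializes the nodes:
 * plain list_del() poisons the pointers, and a popped llist_node still
 * looks "on a list" until init_llist_node() is called on it.
 */
static bool sketch_req_queued(struct sketch_req *req)
{
	return llist_on_list(&req->lentry) || !list_empty(&req->entry);
}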
@@ -560,6 +561,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	req->queue = queue;
 	nvme_req(rq)->ctrl = &ctrl->ctrl;
 	nvme_req(rq)->cmd = &pdu->cmd;
+	init_llist_node(&req->lentry);
+	INIT_LIST_HEAD(&req->entry);
 	return 0;
 }
@@ -764,6 +767,14 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
 		return -EPROTO;
 	}
 
+	if (llist_on_list(&req->lentry) ||
+	    !list_empty(&req->entry)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"req %d unexpected r2t while processing request\n",
+			rq->tag);
+		return -EPROTO;
+	}
+
 	req->pdu_len = 0;
 	req->h2cdata_left = r2t_length;
 	req->h2cdata_offset = r2t_offset;
@@ -1350,7 +1361,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
 	queue->nr_cqe = 0;
 	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
 	release_sock(sk);
-	return consumed;
+	return consumed == -EAGAIN ? 0 : consumed;
 }
 
 static void nvme_tcp_io_work(struct work_struct *w)
@@ -1378,6 +1389,11 @@ static void nvme_tcp_io_work(struct work_struct *w)
 	else if (unlikely(result < 0))
 		return;
 
+	/* did we get some space after spending time in recv? */
+	if (nvme_tcp_queue_has_pending(queue) &&
+	    sk_stream_is_writeable(queue->sock->sk))
+		pending = true;
+
 	if (!pending || !queue->rd_enabled)
 		return;
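Combined with the -EAGAIN change above, the io_work loop now re-arms itself when requests are still queued and the socket regained send space, instead of stalling until the next data-ready callback. The shape of one pass, with hypothetical stand-in helpers:

struct sketch_queue;					/* opaque stand-in */
int sketch_try_recv(struct sketch_queue *q);		/* now returns 0 on -EAGAIN */
bool sketch_has_pending(struct sketch_queue *q);
bool sketch_sock_writeable(struct sketch_queue *q);

static bool sketch_io_work_pass(struct sketch_queue *q, bool pending)
{
	int result = sketch_try_recv(q);

	if (result > 0)
		pending = true;		/* consumed data, go around again */
	else if (result < 0)
		return false;		/* hard error: stop the loop */

	/* recv may have freed socket send space: resume stalled sends */
	if (sketch_has_pending(q) && sketch_sock_writeable(q))
		pending = true;

	return pending;			/* true => requeue the io_work item */
}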
@@ -2394,7 +2410,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 		nvme_tcp_teardown_admin_queue(ctrl, false);
 		ret = nvme_tcp_configure_admin_queue(ctrl, false);
 		if (ret)
-			return ret;
+			goto destroy_admin;
 	}
 
 	if (ctrl->icdoff) {
@@ -2638,6 +2654,8 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
 	ctrl->async_req.offset = 0;
 	ctrl->async_req.curr_bio = NULL;
 	ctrl->async_req.data_len = 0;
+	init_llist_node(&ctrl->async_req.lentry);
+	INIT_LIST_HEAD(&ctrl->async_req.entry);
 	nvme_tcp_queue_request(&ctrl->async_req, true);
 }
drivers/nvme/target/admin-cmd.c (+1 -1)
@@ -1165,7 +1165,7 @@ static void nvmet_execute_identify(struct nvmet_req *req)
* A "minimum viable" abort implementation: the command is mandatory in the
* spec, but we are not required to do any useful work. We couldn't really
* do a useful abort, so don't bother even with waiting for the command
* to be exectuted and return immediately telling the command to abort
* to be executed and return immediately telling the command to abort
* wasn't found.
*/
static void nvmet_execute_abort(struct nvmet_req *req)
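A handler consistent with that comment can be minimal. A sketch using the nvmet internals (the real body may differ): complete the Abort command itself with status 0, with result bit 0 set to report "command not aborted":

static void sketch_execute_abort(struct nvmet_req *req)
{
	nvmet_set_result(req, 1);	/* CDW0 bit 0: command not aborted */
	nvmet_req_complete(req, 0);	/* the Abort command itself succeeds */
}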
drivers/nvme/target/core.c (+2 -9)
@@ -62,14 +62,7 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
 		return NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
 	case -EOPNOTSUPP:
 		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		switch (req->cmd->common.opcode) {
-		case nvme_cmd_dsm:
-		case nvme_cmd_write_zeroes:
-			return NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR;
-		default:
-			return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
-		}
-		break;
+		return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
 	case -ENODATA:
 		req->error_loc = offsetof(struct nvme_rw_command, nsid);
 		return NVME_SC_ACCESS_DENIED;
@@ -651,7 +644,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	 * Now that we removed the namespaces from the lookup list, we
 	 * can kill the per_cpu ref and wait for any remaining references
 	 * to be dropped, as well as a RCU grace period for anyone only
-	 * using the namepace under rcu_read_lock(). Note that we can't
+	 * using the namespace under rcu_read_lock(). Note that we can't
 	 * use call_rcu here as we need to ensure the namespaces have
 	 * been fully destroyed before unloading the module.
 	 */
drivers/nvme/target/fc.c (+1 -1)
@@ -1339,7 +1339,7 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport)
 /**
  * nvmet_fc_register_targetport - transport entry point called by an
  * LLDD to register the existence of a local
- * NVME subystem FC port.
+ * NVME subsystem FC port.
  * @pinfo: pointer to information about the port to be registered
  * @template: LLDD entrypoints and operational parameters for the port
  * @dev: physical hardware device node port corresponds to. Will be
drivers/nvme/target/io-cmd-bdev.c (+2 -9)
@@ -133,7 +133,7 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
 	 * Right now there exists M : 1 mapping between block layer error
 	 * to the NVMe status code (see nvme_error_status()). For consistency,
 	 * when we reverse map we use most appropriate NVMe Status code from
-	 * the group of the NVMe staus codes used in the nvme_error_status().
+	 * the group of the NVMe status codes used in the nvme_error_status().
 	 */
 	switch (blk_sts) {
 	case BLK_STS_NOSPC:
@@ -145,15 +145,8 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
 		req->error_loc = offsetof(struct nvme_rw_command, slba);
 		break;
 	case BLK_STS_NOTSUPP:
+		status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
 		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		switch (req->cmd->common.opcode) {
-		case nvme_cmd_dsm:
-		case nvme_cmd_write_zeroes:
-			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR;
-			break;
-		default:
-			status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
-		}
 		break;
 	case BLK_STS_MEDIUM:
 		status = NVME_SC_ACCESS_DENIED;
drivers/nvme/target/passthru.c (+1 -1)
@@ -99,7 +99,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
 	/*
 	 * The passthru NVMe driver may have a limit on the number of segments
-	 * which depends on the host's memory fragementation. To solve this,
+	 * which depends on the host's memory fragmentation. To solve this,
 	 * ensure mdts is limited to the pages equal to the number of segments.
 	 */
 	max_hw_sectors = min_not_zero(pctrl->max_segments << PAGE_SECTORS_SHIFT,
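The unit conversion here is easy to misread: max_segments is a page count, and PAGE_SECTORS_SHIFT (PAGE_SHIFT - 9, i.e. 3 on 4 KiB pages) turns it into 512-byte sectors. A hypothetical standalone mirror of the arithmetic:

#define SKETCH_PAGE_SECTORS_SHIFT 3	/* PAGE_SHIFT(12) - SECTOR_SHIFT(9), assumes 4 KiB pages */

static unsigned int segments_to_max_sectors(unsigned int max_segments)
{
	/* one page per segment -> cap expressed in 512-byte sectors */
	return max_segments << SKETCH_PAGE_SECTORS_SHIFT;
}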
include/linux/nvme.h (+1 -1)
@@ -2171,7 +2171,7 @@ enum {
 	NVME_SC_BAD_ATTRIBUTES = 0x180,
 	NVME_SC_INVALID_PI = 0x181,
 	NVME_SC_READ_ONLY = 0x182,
-	NVME_SC_ONCS_NOT_SUPPORTED = 0x183,
+	NVME_SC_CMD_SIZE_LIM_EXCEEDED = 0x183,
 
 	/*
 	 * I/O Command Set Specific - Fabrics commands: