Merge tag 'nvme-6.16-2025-06-05' of git://git.infradead.org/nvme into block-6.16
Pull NVMe updates and fixes from Christoph:

"nvme updates for Linux 6.16

 - TCP error handling fix (Shin'ichiro Kawasaki)
 - TCP I/O stall handling fixes (Hannes Reinecke)
 - fix command limits status code (Keith Busch)
 - support vectored buffers also for passthrough (Pavel Begunkov)
 - spelling fixes (Yi Zhang)"

* tag 'nvme-6.16-2025-06-05' of git://git.infradead.org/nvme:
  nvme: spelling fixes
  nvme-tcp: fix I/O stalls on congested sockets
  nvme-tcp: sanitize request list handling
  nvme-tcp: remove tag set when second admin queue config fails
  nvme: enable vectored registered bufs for passthrough cmds
  nvme: fix implicit bool to flags conversion
  nvme: fix command limits status code
This commit is contained in:
@@ -471,7 +471,7 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
|
||||
* @c1: Value of challenge C1
|
||||
* @c2: Value of challenge C2
|
||||
* @hash_len: Hash length of the hash algorithm
|
||||
* @ret_psk: Pointer too the resulting generated PSK
|
||||
* @ret_psk: Pointer to the resulting generated PSK
|
||||
* @ret_len: length of @ret_psk
|
||||
*
|
||||
* Generate a PSK for TLS as specified in NVMe base specification, section
|
||||
@@ -759,8 +759,8 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len,
|
||||
goto out_free_prk;
|
||||
|
||||
/*
|
||||
* 2 addtional bytes for the length field from HDKF-Expand-Label,
|
||||
* 2 addtional bytes for the HMAC ID, and one byte for the space
|
||||
* 2 additional bytes for the length field from HKDF-Expand-Label,
|
||||
* 2 additional bytes for the HMAC ID, and one byte for the space
|
||||
* separator.
|
||||
*/
|
||||
info_len = strlen(psk_digest) + strlen(psk_prefix) + 5;
|
||||
|
||||
@@ -106,7 +106,7 @@ config NVME_TCP_TLS
|
||||
help
|
||||
Enables TLS encryption for NVMe TCP using the netlink handshake API.
|
||||
|
||||
The TLS handshake daemon is availble at
|
||||
The TLS handshake daemon is available at
|
||||
https://github.com/oracle/ktls-utils.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
@@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = {
|
||||
[NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes",
|
||||
[NVME_SC_INVALID_PI] = "Invalid Protection Information",
|
||||
[NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range",
|
||||
[NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported",
|
||||
[NVME_SC_CMD_SIZE_LIM_EXCEEDED ] = "Command Size Limits Exceeded",
|
||||
[NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error",
|
||||
[NVME_SC_ZONE_FULL] = "Zone Is Full",
|
||||
[NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only",
|
||||
|
||||
@@ -290,7 +290,6 @@ static blk_status_t nvme_error_status(u16 status)
|
||||
case NVME_SC_NS_NOT_READY:
|
||||
return BLK_STS_TARGET;
|
||||
case NVME_SC_BAD_ATTRIBUTES:
|
||||
case NVME_SC_ONCS_NOT_SUPPORTED:
|
||||
case NVME_SC_INVALID_OPCODE:
|
||||
case NVME_SC_INVALID_FIELD:
|
||||
case NVME_SC_INVALID_NS:
|
||||
@@ -1027,7 +1026,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
|
||||
if (ns->head->ms) {
|
||||
/*
|
||||
* If formated with metadata, the block layer always provides a
|
||||
* If formatted with metadata, the block layer always provides a
|
||||
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
|
||||
* we enable the PRACT bit for protection information or set the
|
||||
* namespace capacity to zero to prevent any I/O.
|
||||
|
||||
@@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
|
||||
* Do not retry when:
|
||||
*
|
||||
* - the DNR bit is set and the specification states no further connect
|
||||
* attempts with the same set of paramenters should be attempted.
|
||||
* attempts with the same set of parameters should be attempted.
|
||||
*
|
||||
* - when the authentication attempt fails, because the key was invalid.
|
||||
* This error code is set on the host side.
|
||||
|
||||
@@ -80,7 +80,7 @@ enum {
|
||||
* @transport: Holds the fabric transport "technology name" (for a lack of
|
||||
* better description) that will be used by an NVMe controller
|
||||
* being added.
|
||||
* @subsysnqn: Hold the fully qualified NQN subystem name (format defined
|
||||
* @subsysnqn: Hold the fully qualified NQN subsystem name (format defined
|
||||
* in the NVMe specification, "NVMe Qualified Names").
|
||||
* @traddr: The transport-specific TRADDR field for a port on the
|
||||
* subsystem which is adding a controller.
|
||||
@@ -156,7 +156,7 @@ struct nvmf_ctrl_options {
|
||||
* @create_ctrl(): function pointer that points to a non-NVMe
|
||||
* implementation-specific fabric technology
|
||||
* that would go into starting up that fabric
|
||||
* for the purpose of conneciton to an NVMe controller
|
||||
* for the purpose of connection to an NVMe controller
|
||||
* using that fabric technology.
|
||||
*
|
||||
* Notes:
|
||||
@@ -165,7 +165,7 @@ struct nvmf_ctrl_options {
|
||||
* 2. create_ctrl() must be defined (even if it does nothing)
|
||||
* 3. struct nvmf_transport_ops must be statically allocated in the
|
||||
* modules .bss section so that a pure module_get on @module
|
||||
* prevents the memory from beeing freed.
|
||||
* prevents the memory from being freed.
|
||||
*/
|
||||
struct nvmf_transport_ops {
|
||||
struct list_head entry;
|
||||
|
||||
@@ -1955,7 +1955,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
|
||||
}
|
||||
|
||||
/*
|
||||
* For the linux implementation, if we have an unsuccesful
|
||||
* For the linux implementation, if we have an unsuccessful
|
||||
* status, the blk-mq layer can typically be called with the
|
||||
* non-zero status and the content of the cqe isn't important.
|
||||
*/
|
||||
@@ -2479,7 +2479,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
|
||||
* writing the registers for shutdown and polling (call
|
||||
* nvme_disable_ctrl()). Given a bunch of i/o was potentially
|
||||
* just aborted and we will wait on those contexts, and given
|
||||
* there was no indication of how live the controlelr is on the
|
||||
* there was no indication of how live the controller is on the
|
||||
* link, don't send more io to create more contexts for the
|
||||
* shutdown. Let the controller fail via keepalive failure if
|
||||
* it's still present.
|
||||
|
||||
@@ -493,13 +493,15 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
d.timeout_ms = READ_ONCE(cmd->timeout_ms);
|
||||
|
||||
if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
|
||||
/* fixedbufs is only for non-vectored io */
|
||||
if (vec)
|
||||
return -EINVAL;
|
||||
int ddir = nvme_is_write(&c) ? WRITE : READ;
|
||||
|
||||
ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
|
||||
nvme_is_write(&c) ? WRITE : READ, &iter, ioucmd,
|
||||
issue_flags);
|
||||
if (vec)
|
||||
ret = io_uring_cmd_import_fixed_vec(ioucmd,
|
||||
u64_to_user_ptr(d.addr), d.data_len,
|
||||
ddir, &iter, issue_flags);
|
||||
else
|
||||
ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
|
||||
ddir, &iter, ioucmd, issue_flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
@@ -521,7 +523,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
if (d.data_len) {
|
||||
ret = nvme_map_user_request(req, d.addr, d.data_len,
|
||||
nvme_to_user_ptr(d.metadata), d.metadata_len,
|
||||
map_iter, vec);
|
||||
map_iter, vec ? NVME_IOCTL_VEC : 0);
|
||||
if (ret)
|
||||
goto out_free_req;
|
||||
}
|
||||
@@ -727,7 +729,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
|
||||
|
||||
/*
|
||||
* Handle ioctls that apply to the controller instead of the namespace
|
||||
* seperately and drop the ns SRCU reference early. This avoids a
|
||||
* separately and drop the ns SRCU reference early. This avoids a
|
||||
* deadlock when deleting namespaces using the passthrough interface.
|
||||
*/
|
||||
if (is_ctrl_ioctl(cmd))
|
||||
|
||||
@@ -760,7 +760,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
|
||||
* controller's scan_work context. If a path error occurs here, the IO
|
||||
* will wait until a path becomes available or all paths are torn down,
|
||||
* but that action also occurs within scan_work, so it would deadlock.
|
||||
* Defer the partion scan to a different context that does not block
|
||||
* Defer the partition scan to a different context that does not block
|
||||
* scan_work.
|
||||
*/
|
||||
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
|
||||
|
||||
@@ -523,7 +523,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head)
|
||||
enum nvme_ns_features {
|
||||
NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
|
||||
NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
|
||||
NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeores supported */
|
||||
NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeroes supported */
|
||||
};
|
||||
|
||||
struct nvme_ns {
|
||||
|
||||
@@ -3015,7 +3015,7 @@ static void nvme_reset_work(struct work_struct *work)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Freeze and update the number of I/O queues as thos might have
|
||||
* Freeze and update the number of I/O queues as those might have
|
||||
* changed. If there are no I/O queues left after this reset, keep the
|
||||
* controller around but remove all namespaces.
|
||||
*/
|
||||
@@ -3186,7 +3186,7 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
|
||||
/*
|
||||
* Exclude some Kingston NV1 and A2000 devices from
|
||||
* NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a
|
||||
* lot fo energy with s2idle sleep on some TUXEDO platforms.
|
||||
* lot of energy with s2idle sleep on some TUXEDO platforms.
|
||||
*/
|
||||
if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
|
||||
dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
|
||||
|
||||
@@ -82,8 +82,6 @@ static int nvme_status_to_pr_err(int status)
|
||||
return PR_STS_SUCCESS;
|
||||
case NVME_SC_RESERVATION_CONFLICT:
|
||||
return PR_STS_RESERVATION_CONFLICT;
|
||||
case NVME_SC_ONCS_NOT_SUPPORTED:
|
||||
return -EOPNOTSUPP;
|
||||
case NVME_SC_BAD_ATTRIBUTES:
|
||||
case NVME_SC_INVALID_OPCODE:
|
||||
case NVME_SC_INVALID_FIELD:
|
||||
|
||||
@@ -221,7 +221,7 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
|
||||
|
||||
/*
|
||||
* Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
|
||||
* lifetime. It's safe, since any chage in the underlying RDMA device
|
||||
* lifetime. It's safe, since any change in the underlying RDMA device
|
||||
* will issue error recovery and queue re-creation.
|
||||
*/
|
||||
for (i = 0; i < ib_queue_size; i++) {
|
||||
@@ -800,7 +800,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
|
||||
|
||||
/*
|
||||
* Bind the async event SQE DMA mapping to the admin queue lifetime.
|
||||
* It's safe, since any chage in the underlying RDMA device will issue
|
||||
* It's safe, since any change in the underlying RDMA device will issue
|
||||
* error recovery and queue re-creation.
|
||||
*/
|
||||
error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
|
||||
|
||||
+21
-3
@@ -452,7 +452,8 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
list_del(&req->entry);
|
||||
list_del_init(&req->entry);
|
||||
init_llist_node(&req->lentry);
|
||||
return req;
|
||||
}
|
||||
|
||||
@@ -560,6 +561,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
|
||||
req->queue = queue;
|
||||
nvme_req(rq)->ctrl = &ctrl->ctrl;
|
||||
nvme_req(rq)->cmd = &pdu->cmd;
|
||||
init_llist_node(&req->lentry);
|
||||
INIT_LIST_HEAD(&req->entry);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -764,6 +767,14 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
|
||||
return -EPROTO;
|
||||
}
|
||||
|
||||
if (llist_on_list(&req->lentry) ||
|
||||
!list_empty(&req->entry)) {
|
||||
dev_err(queue->ctrl->ctrl.device,
|
||||
"req %d unexpected r2t while processing request\n",
|
||||
rq->tag);
|
||||
return -EPROTO;
|
||||
}
|
||||
|
||||
req->pdu_len = 0;
|
||||
req->h2cdata_left = r2t_length;
|
||||
req->h2cdata_offset = r2t_offset;
|
||||
@@ -1350,7 +1361,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
|
||||
queue->nr_cqe = 0;
|
||||
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
|
||||
release_sock(sk);
|
||||
return consumed;
|
||||
return consumed == -EAGAIN ? 0 : consumed;
|
||||
}
|
||||
|
||||
static void nvme_tcp_io_work(struct work_struct *w)
|
||||
@@ -1378,6 +1389,11 @@ static void nvme_tcp_io_work(struct work_struct *w)
|
||||
else if (unlikely(result < 0))
|
||||
return;
|
||||
|
||||
/* did we get some space after spending time in recv? */
|
||||
if (nvme_tcp_queue_has_pending(queue) &&
|
||||
sk_stream_is_writeable(queue->sock->sk))
|
||||
pending = true;
|
||||
|
||||
if (!pending || !queue->rd_enabled)
|
||||
return;
|
||||
|
||||
@@ -2394,7 +2410,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
|
||||
nvme_tcp_teardown_admin_queue(ctrl, false);
|
||||
ret = nvme_tcp_configure_admin_queue(ctrl, false);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto destroy_admin;
|
||||
}
|
||||
|
||||
if (ctrl->icdoff) {
|
||||
@@ -2638,6 +2654,8 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
|
||||
ctrl->async_req.offset = 0;
|
||||
ctrl->async_req.curr_bio = NULL;
|
||||
ctrl->async_req.data_len = 0;
|
||||
init_llist_node(&ctrl->async_req.lentry);
|
||||
INIT_LIST_HEAD(&ctrl->async_req.entry);
|
||||
|
||||
nvme_tcp_queue_request(&ctrl->async_req, true);
|
||||
}
|
||||
|
||||
@@ -1165,7 +1165,7 @@ static void nvmet_execute_identify(struct nvmet_req *req)
|
||||
* A "minimum viable" abort implementation: the command is mandatory in the
|
||||
* spec, but we are not required to do any useful work. We couldn't really
|
||||
* do a useful abort, so don't bother even with waiting for the command
|
||||
* to be exectuted and return immediately telling the command to abort
|
||||
* to be executed and return immediately telling the command to abort
|
||||
* wasn't found.
|
||||
*/
|
||||
static void nvmet_execute_abort(struct nvmet_req *req)
|
||||
|
||||
@@ -62,14 +62,7 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
|
||||
return NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
|
||||
case -EOPNOTSUPP:
|
||||
req->error_loc = offsetof(struct nvme_common_command, opcode);
|
||||
switch (req->cmd->common.opcode) {
|
||||
case nvme_cmd_dsm:
|
||||
case nvme_cmd_write_zeroes:
|
||||
return NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR;
|
||||
default:
|
||||
return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
|
||||
}
|
||||
break;
|
||||
return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
|
||||
case -ENODATA:
|
||||
req->error_loc = offsetof(struct nvme_rw_command, nsid);
|
||||
return NVME_SC_ACCESS_DENIED;
|
||||
@@ -651,7 +644,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
|
||||
* Now that we removed the namespaces from the lookup list, we
|
||||
* can kill the per_cpu ref and wait for any remaining references
|
||||
* to be dropped, as well as a RCU grace period for anyone only
|
||||
* using the namepace under rcu_read_lock(). Note that we can't
|
||||
* using the namespace under rcu_read_lock(). Note that we can't
|
||||
* use call_rcu here as we need to ensure the namespaces have
|
||||
* been fully destroyed before unloading the module.
|
||||
*/
|
||||
|
||||
@@ -1339,7 +1339,7 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport)
|
||||
/**
|
||||
* nvmet_fc_register_targetport - transport entry point called by an
|
||||
* LLDD to register the existence of a local
|
||||
* NVME subystem FC port.
|
||||
* NVME subsystem FC port.
|
||||
* @pinfo: pointer to information about the port to be registered
|
||||
* @template: LLDD entrypoints and operational parameters for the port
|
||||
* @dev: physical hardware device node port corresponds to. Will be
|
||||
|
||||
@@ -133,7 +133,7 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
|
||||
* Right now there exists M : 1 mapping between block layer error
|
||||
* to the NVMe status code (see nvme_error_status()). For consistency,
|
||||
* when we reverse map we use most appropriate NVMe Status code from
|
||||
* the group of the NVMe staus codes used in the nvme_error_status().
|
||||
* the group of the NVMe status codes used in the nvme_error_status().
|
||||
*/
|
||||
switch (blk_sts) {
|
||||
case BLK_STS_NOSPC:
|
||||
@@ -145,15 +145,8 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
|
||||
req->error_loc = offsetof(struct nvme_rw_command, slba);
|
||||
break;
|
||||
case BLK_STS_NOTSUPP:
|
||||
status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
|
||||
req->error_loc = offsetof(struct nvme_common_command, opcode);
|
||||
switch (req->cmd->common.opcode) {
|
||||
case nvme_cmd_dsm:
|
||||
case nvme_cmd_write_zeroes:
|
||||
status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_STATUS_DNR;
|
||||
break;
|
||||
default:
|
||||
status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
|
||||
}
|
||||
break;
|
||||
case BLK_STS_MEDIUM:
|
||||
status = NVME_SC_ACCESS_DENIED;
|
||||
|
||||
@@ -99,7 +99,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
|
||||
|
||||
/*
|
||||
* The passthru NVMe driver may have a limit on the number of segments
|
||||
* which depends on the host's memory fragementation. To solve this,
|
||||
* which depends on the host's memory fragmentation. To solve this,
|
||||
* ensure mdts is limited to the pages equal to the number of segments.
|
||||
*/
|
||||
max_hw_sectors = min_not_zero(pctrl->max_segments << PAGE_SECTORS_SHIFT,
|
||||
|
||||
@@ -2171,7 +2171,7 @@ enum {
|
||||
NVME_SC_BAD_ATTRIBUTES = 0x180,
|
||||
NVME_SC_INVALID_PI = 0x181,
|
||||
NVME_SC_READ_ONLY = 0x182,
|
||||
NVME_SC_ONCS_NOT_SUPPORTED = 0x183,
|
||||
NVME_SC_CMD_SIZE_LIM_EXCEEDED = 0x183,
|
||||
|
||||
/*
|
||||
* I/O Command Set Specific - Fabrics commands:
|
||||
|
||||
Reference in New Issue
Block a user