From dda10fc801a9d81829e624ef75ecdfc94f39b310 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 7 Aug 2024 16:18:03 -0500 Subject: [PATCH 1/7] ibmvnic: Only replenish rx pool when resources are getting low Previously, the driver would replenish the rx pool if the polling function consumed less than the budget. The logic being that the driver did not exhaust its budget so that must mean that the driver is not busy and has cycles to spare for replenishing the pool. So pool replenishment happens on every poll which did not consume the budget. This can very costly during request-response tests. In fact, an extra ~100pps can be seen in TCP_RR_150 tests when we remove this conditional. Trace results (ftrace, graph-time=1) for the poll function are below: Previous results: ibmvnic_poll = 64951846.0 us / 4167628.0 hits = AVG 15.58 replenish_rx_pool = 17602846.0 us / 4710437.0 hits = AVG 3.74 Now: ibmvnic_poll = 57673941.0 us / 4791737.0 hits = AVG 12.04 replenish_rx_pool = 3938171.6 us / 4314.0 hits = AVG 912.88 While the replenish function takes longer, it is hit less frequently meaning the ibmvnic_poll function, on average, is faster. Furthermore, this change does not have a negative effect on performance bandwidth/latency measurements. Signed-off-by: Nick Child Link: https://patch.msgid.link/20240807211809.1259563-2-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 23ebeb143987..857d585bd229 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -3527,9 +3527,8 @@ restart_poll: } if (adapter->state != VNIC_CLOSING && - ((atomic_read(&adapter->rx_pool[scrq_num].available) < - adapter->req_rx_add_entries_per_subcrq / 2) || - frames_processed < budget)) + (atomic_read(&adapter->rx_pool[scrq_num].available) < + adapter->req_rx_add_entries_per_subcrq / 2)) replenish_rx_pool(adapter, &adapter->rx_pool[scrq_num]); if (frames_processed < budget) { if (napi_complete_done(napi, frames_processed)) { From b41b45ecee6ba74bd590b113be9f349c6b818b46 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 7 Aug 2024 16:18:04 -0500 Subject: [PATCH 2/7] ibmvnic: Use header len helper functions on tx Use the header length helper functions rather than trying to calculate it within the driver. There are defined functions for mac and network headers (skb_mac_header_len and skb_network_header_len) but no such function exists for the transport header length. Also, hdr_data was memset during allocation to all 0's so no need to memset again. Signed-off-by: Nick Child Link: https://patch.msgid.link/20240807211809.1259563-3-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 857d585bd229..7d552d4bbe15 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2156,36 +2156,28 @@ static int build_hdr_data(u8 hdr_field, struct sk_buff *skb, int len = 0; u8 *hdr; - if (skb_vlan_tagged(skb) && !skb_vlan_tag_present(skb)) - hdr_len[0] = sizeof(struct vlan_ethhdr); - else - hdr_len[0] = sizeof(struct ethhdr); if (skb->protocol == htons(ETH_P_IP)) { - hdr_len[1] = ip_hdr(skb)->ihl * 4; if (ip_hdr(skb)->protocol == IPPROTO_TCP) hdr_len[2] = tcp_hdrlen(skb); else if (ip_hdr(skb)->protocol == IPPROTO_UDP) hdr_len[2] = sizeof(struct udphdr); } else if (skb->protocol == htons(ETH_P_IPV6)) { - hdr_len[1] = sizeof(struct ipv6hdr); if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP) hdr_len[2] = tcp_hdrlen(skb); else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP) hdr_len[2] = sizeof(struct udphdr); - } else if (skb->protocol == htons(ETH_P_ARP)) { - hdr_len[1] = arp_hdr_len(skb->dev); - hdr_len[2] = 0; } - memset(hdr_data, 0, 120); if ((hdr_field >> 6) & 1) { + hdr_len[0] = skb_mac_header_len(skb); hdr = skb_mac_header(skb); memcpy(hdr_data, hdr, hdr_len[0]); len += hdr_len[0]; } if ((hdr_field >> 5) & 1) { + hdr_len[1] = skb_network_header_len(skb); hdr = skb_network_header(skb); memcpy(hdr_data + len, hdr, hdr_len[1]); len += hdr_len[1]; @@ -2196,6 +2188,7 @@ static int build_hdr_data(u8 hdr_field, struct sk_buff *skb, memcpy(hdr_data + len, hdr, hdr_len[2]); len += hdr_len[2]; } + return len; } From d95f749a0b5e42a30d7025d19c15a11c1a570e72 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 7 Aug 2024 16:18:05 -0500 Subject: [PATCH 3/7] ibmvnic: Reduce memcpys in tx descriptor generation Previously when creating the header descriptors, the driver would: 1. allocate a temporary buffer on the stack (in build_hdr_descs_arr) 2. memcpy the header info into the temporary buffer (in build_hdr_data) 3. memcpy the temp buffer into a local variable (in create_hdr_descs) 4. copy the local variable into the return buffer (in create_hdr_descs) Since, there is no opportunity for errors during this process, the temp buffer is not needed and work can be done on the return buffer directly. Repurpose build_hdr_data() to only calculate the header lengths. Rename it to get_hdr_lens(). Edit create_hdr_descs() to read from the skb directly and copy directly into the returned useful buffer. The process now involves less memory and write operations while also being more readable. Signed-off-by: Nick Child Link: https://patch.msgid.link/20240807211809.1259563-4-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 89 ++++++++++++++---------------- 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 7d552d4bbe15..58a517ecbda3 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2140,23 +2140,36 @@ static int ibmvnic_close(struct net_device *netdev) } /** - * build_hdr_data - creates L2/L3/L4 header data buffer + * get_hdr_lens - fills list of L2/L3/L4 hdr lens * @hdr_field: bitfield determining needed headers * @skb: socket buffer - * @hdr_len: array of header lengths - * @hdr_data: buffer to write the header to + * @hdr_len: array of header lengths to be filled * * Reads hdr_field to determine which headers are needed by firmware. * Builds a buffer containing these headers. Saves individual header * lengths and total buffer length to be used to build descriptors. + * + * Return: total len of all headers */ -static int build_hdr_data(u8 hdr_field, struct sk_buff *skb, - int *hdr_len, u8 *hdr_data) +static int get_hdr_lens(u8 hdr_field, struct sk_buff *skb, + int *hdr_len) { int len = 0; - u8 *hdr; + if ((hdr_field >> 6) & 1) { + hdr_len[0] = skb_mac_header_len(skb); + len += hdr_len[0]; + } + + if ((hdr_field >> 5) & 1) { + hdr_len[1] = skb_network_header_len(skb); + len += hdr_len[1]; + } + + if (!((hdr_field >> 4) & 1)) + return len; + if (skb->protocol == htons(ETH_P_IP)) { if (ip_hdr(skb)->protocol == IPPROTO_TCP) hdr_len[2] = tcp_hdrlen(skb); @@ -2169,27 +2182,7 @@ static int build_hdr_data(u8 hdr_field, struct sk_buff *skb, hdr_len[2] = sizeof(struct udphdr); } - if ((hdr_field >> 6) & 1) { - hdr_len[0] = skb_mac_header_len(skb); - hdr = skb_mac_header(skb); - memcpy(hdr_data, hdr, hdr_len[0]); - len += hdr_len[0]; - } - - if ((hdr_field >> 5) & 1) { - hdr_len[1] = skb_network_header_len(skb); - hdr = skb_network_header(skb); - memcpy(hdr_data + len, hdr, hdr_len[1]); - len += hdr_len[1]; - } - - if ((hdr_field >> 4) & 1) { - hdr = skb_transport_header(skb); - memcpy(hdr_data + len, hdr, hdr_len[2]); - len += hdr_len[2]; - } - - return len; + return len + hdr_len[2]; } /** @@ -2202,12 +2195,14 @@ static int build_hdr_data(u8 hdr_field, struct sk_buff *skb, * * Creates header and, if needed, header extension descriptors and * places them in a descriptor array, scrq_arr + * + * Return: Number of header descs */ static int create_hdr_descs(u8 hdr_field, u8 *hdr_data, int len, int *hdr_len, union sub_crq *scrq_arr) { - union sub_crq hdr_desc; + union sub_crq *hdr_desc; int tmp_len = len; int num_descs = 0; u8 *data, *cur; @@ -2216,28 +2211,26 @@ static int create_hdr_descs(u8 hdr_field, u8 *hdr_data, int len, int *hdr_len, while (tmp_len > 0) { cur = hdr_data + len - tmp_len; - memset(&hdr_desc, 0, sizeof(hdr_desc)); - if (cur != hdr_data) { - data = hdr_desc.hdr_ext.data; + hdr_desc = &scrq_arr[num_descs]; + if (num_descs) { + data = hdr_desc->hdr_ext.data; tmp = tmp_len > 29 ? 29 : tmp_len; - hdr_desc.hdr_ext.first = IBMVNIC_CRQ_CMD; - hdr_desc.hdr_ext.type = IBMVNIC_HDR_EXT_DESC; - hdr_desc.hdr_ext.len = tmp; + hdr_desc->hdr_ext.first = IBMVNIC_CRQ_CMD; + hdr_desc->hdr_ext.type = IBMVNIC_HDR_EXT_DESC; + hdr_desc->hdr_ext.len = tmp; } else { - data = hdr_desc.hdr.data; + data = hdr_desc->hdr.data; tmp = tmp_len > 24 ? 24 : tmp_len; - hdr_desc.hdr.first = IBMVNIC_CRQ_CMD; - hdr_desc.hdr.type = IBMVNIC_HDR_DESC; - hdr_desc.hdr.len = tmp; - hdr_desc.hdr.l2_len = (u8)hdr_len[0]; - hdr_desc.hdr.l3_len = cpu_to_be16((u16)hdr_len[1]); - hdr_desc.hdr.l4_len = (u8)hdr_len[2]; - hdr_desc.hdr.flag = hdr_field << 1; + hdr_desc->hdr.first = IBMVNIC_CRQ_CMD; + hdr_desc->hdr.type = IBMVNIC_HDR_DESC; + hdr_desc->hdr.len = tmp; + hdr_desc->hdr.l2_len = (u8)hdr_len[0]; + hdr_desc->hdr.l3_len = cpu_to_be16((u16)hdr_len[1]); + hdr_desc->hdr.l4_len = (u8)hdr_len[2]; + hdr_desc->hdr.flag = hdr_field << 1; } memcpy(data, cur, tmp); tmp_len -= tmp; - *scrq_arr = hdr_desc; - scrq_arr++; num_descs++; } @@ -2260,13 +2253,11 @@ static void build_hdr_descs_arr(struct sk_buff *skb, int *num_entries, u8 hdr_field) { int hdr_len[3] = {0, 0, 0}; - u8 hdr_data[140] = {0}; int tot_len; - tot_len = build_hdr_data(hdr_field, skb, hdr_len, - hdr_data); - *num_entries += create_hdr_descs(hdr_field, hdr_data, tot_len, hdr_len, - indir_arr + 1); + tot_len = get_hdr_lens(hdr_field, skb, hdr_len); + *num_entries += create_hdr_descs(hdr_field, skb_mac_header(skb), + tot_len, hdr_len, indir_arr + 1); } static int ibmvnic_xmit_workarounds(struct sk_buff *skb, From 6e7a57581abea5335cbf29d738724d25d3c302d7 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 7 Aug 2024 16:18:06 -0500 Subject: [PATCH 4/7] ibmvnic: Remove duplicate memory barriers in tx send_subcrq_[in]direct() already has a dma memory barrier. Remove the earlier one. Signed-off-by: Nick Child Link: https://patch.msgid.link/20240807211809.1259563-5-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 58a517ecbda3..256feac588ef 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2459,9 +2459,6 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) skb_copy_from_linear_data(skb, dst, skb->len); } - /* post changes to long_term_buff *dst before VIOS accessing it */ - dma_wmb(); - tx_pool->consumer_index = (tx_pool->consumer_index + 1) % tx_pool->num_buffers; From 74839f7a82689bf5a21a5447cae8e3a7b7a606d2 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 7 Aug 2024 16:18:07 -0500 Subject: [PATCH 5/7] ibmvnic: Introduce send sub-crq direct Firmware supports two hcalls to send a sub-crq request: H_SEND_SUB_CRQ_INDIRECT and H_SEND_SUB_CRQ. The indirect hcall allows for submission of batched messages while the other hcall is limited to only one message. This protocol is defined in PAPR section 17.2.3.3. Previously, the ibmvnic xmit function only used the indirect hcall. This allowed the driver to batch it's skbs. A single skb can occupy a few entries per hcall depending on if FW requires skb header information or not. The FW only needs header information if the packet is segmented. By this logic, if an skb is not GSO then it can fit in one sub-crq message and therefore is a candidate for H_SEND_SUB_CRQ. Batching skb transmission is only useful when there are more packets coming down the line (ie netdev_xmit_more is true). As it turns out, H_SEND_SUB_CRQ induces less latency than H_SEND_SUB_CRQ_INDIRECT. Therefore, use H_SEND_SUB_CRQ where appropriate. Small latency gains seen when doing TCP_RR_150 (request/response workload). Ftrace results (graph-time=1): Previous: ibmvnic_xmit = 29618270.83 us / 8860058.0 hits = AVG 3.34 ibmvnic_tx_scrq_flush = 21972231.02 us / 6553972.0 hits = AVG 3.35 Now: ibmvnic_xmit = 22153350.96 us / 8438942.0 hits = AVG 2.63 ibmvnic_tx_scrq_flush = 15858922.4 us / 6244076.0 hits = AVG 2.54 Signed-off-by: Nick Child Link: https://patch.msgid.link/20240807211809.1259563-6-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 52 ++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 256feac588ef..d7262674eab7 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -117,6 +117,7 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, struct ibmvnic_long_term_buff *ltb); static void ibmvnic_disable_irqs(struct ibmvnic_adapter *adapter); static void flush_reset_queue(struct ibmvnic_adapter *adapter); +static void print_subcrq_error(struct device *dev, int rc, const char *func); struct ibmvnic_stat { char name[ETH_GSTRING_LEN]; @@ -2334,8 +2335,29 @@ static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter, } } +static int send_subcrq_direct(struct ibmvnic_adapter *adapter, + u64 remote_handle, u64 *entry) +{ + unsigned int ua = adapter->vdev->unit_address; + struct device *dev = &adapter->vdev->dev; + int rc; + + /* Make sure the hypervisor sees the complete request */ + dma_wmb(); + rc = plpar_hcall_norets(H_SEND_SUB_CRQ, ua, + cpu_to_be64(remote_handle), + cpu_to_be64(entry[0]), cpu_to_be64(entry[1]), + cpu_to_be64(entry[2]), cpu_to_be64(entry[3])); + + if (rc) + print_subcrq_error(dev, rc, __func__); + + return rc; +} + static int ibmvnic_tx_scrq_flush(struct ibmvnic_adapter *adapter, - struct ibmvnic_sub_crq_queue *tx_scrq) + struct ibmvnic_sub_crq_queue *tx_scrq, + bool indirect) { struct ibmvnic_ind_xmit_queue *ind_bufp; u64 dma_addr; @@ -2350,7 +2372,13 @@ static int ibmvnic_tx_scrq_flush(struct ibmvnic_adapter *adapter, if (!entries) return 0; - rc = send_subcrq_indirect(adapter, handle, dma_addr, entries); + + if (indirect) + rc = send_subcrq_indirect(adapter, handle, dma_addr, entries); + else + rc = send_subcrq_direct(adapter, handle, + (u64 *)ind_bufp->indir_arr); + if (rc) ibmvnic_tx_scrq_clean_buffer(adapter, tx_scrq); else @@ -2408,7 +2436,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_dropped++; tx_send_failed++; ret = NETDEV_TX_OK; - lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq); + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); if (lpar_rc != H_SUCCESS) goto tx_err; goto out; @@ -2426,7 +2454,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_send_failed++; tx_dropped++; ret = NETDEV_TX_OK; - lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq); + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); if (lpar_rc != H_SUCCESS) goto tx_err; goto out; @@ -2521,6 +2549,16 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_crq.v1.flags1 |= IBMVNIC_TX_LSO; tx_crq.v1.mss = cpu_to_be16(skb_shinfo(skb)->gso_size); hdrs += 2; + } else if (!ind_bufp->index && !netdev_xmit_more()) { + ind_bufp->indir_arr[0] = tx_crq; + ind_bufp->index = 1; + tx_buff->num_entries = 1; + netdev_tx_sent_queue(txq, skb->len); + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, false); + if (lpar_rc != H_SUCCESS) + goto tx_err; + + goto early_exit; } if ((*hdrs >> 7) & 1) @@ -2530,7 +2568,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_buff->num_entries = num_entries; /* flush buffer if current entry can not fit */ if (num_entries + ind_bufp->index > IBMVNIC_MAX_IND_DESCS) { - lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq); + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); if (lpar_rc != H_SUCCESS) goto tx_flush_err; } @@ -2538,15 +2576,17 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) indir_arr[0] = tx_crq; memcpy(&ind_bufp->indir_arr[ind_bufp->index], &indir_arr[0], num_entries * sizeof(struct ibmvnic_generic_scrq)); + ind_bufp->index += num_entries; if (__netdev_tx_sent_queue(txq, skb->len, netdev_xmit_more() && ind_bufp->index < IBMVNIC_MAX_IND_DESCS)) { - lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq); + lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, true); if (lpar_rc != H_SUCCESS) goto tx_err; } +early_exit: if (atomic_add_return(num_entries, &tx_scrq->used) >= adapter->req_tx_entries_per_subcrq) { netdev_dbg(netdev, "Stopping queue %d\n", queue_num); From 1c33e29245ccb1182876eb57319777456f9e509c Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 7 Aug 2024 16:18:08 -0500 Subject: [PATCH 6/7] ibmvnic: Only record tx completed bytes once per handler Byte Queue Limits depends on dql_completed being called once per tx completion round in order to adjust its algorithm appropriately. The dql->limit value is an approximation of the amount of bytes that the NIC can consume per irq interval. If this approximation is too high then the NIC will become over-saturated. Too low and the NIC will starve. The dql->limit depends on dql->prev-* stats to calculate an optimal value. If dql_completed() is called more than once per irq handler then those prev-* values become unreliable (because they are not an accurate representation of the previous state of the NIC) resulting in a sub-optimal limit value. Therefore, move the call to netdev_tx_completed_queue() to the end of ibmvnic_complete_tx(). When performing 150 sessions of TCP rr (request-response 1 byte packets) workloads, one could observe: PREVIOUSLY: - limit and inflight values hovering around 130 - transaction rate of around 750k pps. NOW: - limit rises and falls in response to inflight (130-900) - transaction rate of around 1M pps (33% improvement) Signed-off-by: Nick Child Link: https://patch.msgid.link/20240807211809.1259563-7-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index d7262674eab7..b687e5396e11 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -4189,20 +4189,17 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter, struct ibmvnic_sub_crq_queue *scrq) { struct device *dev = &adapter->vdev->dev; + int num_packets = 0, total_bytes = 0; struct ibmvnic_tx_pool *tx_pool; struct ibmvnic_tx_buff *txbuff; struct netdev_queue *txq; union sub_crq *next; - int index; - int i; + int index, i; restart_loop: while (pending_scrq(adapter, scrq)) { unsigned int pool = scrq->pool_index; int num_entries = 0; - int total_bytes = 0; - int num_packets = 0; - next = ibmvnic_next_scrq(adapter, scrq); for (i = 0; i < next->tx_comp.num_comps; i++) { index = be32_to_cpu(next->tx_comp.correlators[i]); @@ -4238,8 +4235,6 @@ restart_loop: /* remove tx_comp scrq*/ next->tx_comp.first = 0; - txq = netdev_get_tx_queue(adapter->netdev, scrq->pool_index); - netdev_tx_completed_queue(txq, num_packets, total_bytes); if (atomic_sub_return(num_entries, &scrq->used) <= (adapter->req_tx_entries_per_subcrq / 2) && @@ -4264,6 +4259,9 @@ restart_loop: goto restart_loop; } + txq = netdev_get_tx_queue(adapter->netdev, scrq->pool_index); + netdev_tx_completed_queue(txq, num_packets, total_bytes); + return 0; } From e633e32b60fd6701bed73599b273a2a03621ea54 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Wed, 7 Aug 2024 16:18:09 -0500 Subject: [PATCH 7/7] ibmvnic: Perform tx CSO during send scrq direct During initialization with the vnic server, a bitstring is communicated to the client regarding header info needed during CSO (See "VNIC Capabilities" in PAPR). Most of the time, to be safe, vnic server requests header info for CSO. When header info is needed, multiple TX descriptors are required per skb; This limits the driver to use send_subcrq_indirect instead of send_subcrq_direct. Previously, the vnic server request for header info was ignored. This allowed the use of send_sub_crq_direct. Transmissions were successful because the bitstring returned by vnic server is broad and over cautionary. It was observed that mlx backing devices could actually transmit and handle CSO packets without the vnic server receiving header info (despite the fact that the bitstring requested it). There was a trust issue: The bitstring was overcautionary. This extra precaution (requesting header info when the backing device may not use it) comes at the cost of performance (using direct vs indirect hcalls has a 30% delta in small packet RR transaction rate). So it has been requested that the vnic server team tries to ensure that the bitstring is more exact. In the meantime, disable CSO when it is possible to use the skb in the send_subcrq_direct path. In other words, calculate the checksum before handing the packet to FW when the packet is not segmented and xmit_more is false. Since the code path is only possible if the skb is non GSO and xmit_more is false, the cost of doing checksum in the send_subcrq_direct path is minimal. Any large segmented skb will have xmit_more set to true more frequently and it is inexpensive to do checksumming on a small skb. The worst-case workload would be a 9000 MTU TCP_RR test with close to MTU sized packets (and TSO off). This allows xmit_more to be false more frequently and open the code path up to use send_subcrq_direct. Observing trace data (graph-time = 1) and packet rate with this workload shows minimal performance degradation: 1. NIC does checksum w headers, safely use send_subcrq_indirect: - Packet rate: 631k txs - Trace data: ibmvnic_xmit = 44344685.87 us / 6234576 hits = AVG 7.11 us skb_checksum_help = 4.07 us / 2 hits = AVG 2.04 us ^ Notice hits, tracing this just for reassurance ibmvnic_tx_scrq_flush = 33040649.69 us / 5638441 hits = AVG 5.86 us send_subcrq_indirect = 37438922.24 us / 6030859 hits = AVG 6.21 us 2. NIC does checksum w/o headers, dangerously use send_subcrq_direct: - Packet rate: 831k txs - Trace data: ibmvnic_xmit = 48940092.29 us / 8187630 hits = AVG 5.98 us skb_checksum_help = 2.03 us / 1 hits = AVG 2.03 ibmvnic_tx_scrq_flush = 31141879.57 us / 7948960 hits = AVG 3.92 us send_subcrq_indirect = 8412506.03 us / 728781 hits = AVG 11.54 ^ notice hits is much lower b/c send_subcrq_direct was called ^ wasn't traceable 3. driver does checksum, safely use send_subcrq_direct (THIS PATCH): - Packet rate: 829k txs - Trace data: ibmvnic_xmit = 56696077.63 us / 8066168 hits = AVG 7.03 us skb_checksum_help = 8587456.16 us / 7526072 hits = AVG 1.14 us ibmvnic_tx_scrq_flush = 30219545.55 us / 7782409 hits = AVG 3.88 us send_subcrq_indirect = 8638326.44 us / 763693 hits = AVG 11.31 us When the bitstring ever specifies that CSO does not require headers (dependent on VIOS vnic server changes), then this patch should be removed and replaced with one that investigates the bitstring before using send_subcrq_direct. Signed-off-by: Nick Child Link: https://patch.msgid.link/20240807211809.1259563-8-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index b687e5396e11..87e693a81433 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2409,6 +2409,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) unsigned long lpar_rc; union sub_crq tx_crq; unsigned int offset; + bool use_scrq_send_direct = false; int num_entries = 1; unsigned char *dst; int bufidx = 0; @@ -2468,6 +2469,18 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) memset(dst, 0, tx_pool->buf_size); data_dma_addr = ltb->addr + offset; + /* if we are going to send_subcrq_direct this then we need to + * update the checksum before copying the data into ltb. Essentially + * these packets force disable CSO so that we can guarantee that + * FW does not need header info and we can send direct. + */ + if (!skb_is_gso(skb) && !ind_bufp->index && !netdev_xmit_more()) { + use_scrq_send_direct = true; + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(skb)) + use_scrq_send_direct = false; + } + if (skb_shinfo(skb)->nr_frags) { int cur, i; @@ -2549,11 +2562,13 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_crq.v1.flags1 |= IBMVNIC_TX_LSO; tx_crq.v1.mss = cpu_to_be16(skb_shinfo(skb)->gso_size); hdrs += 2; - } else if (!ind_bufp->index && !netdev_xmit_more()) { - ind_bufp->indir_arr[0] = tx_crq; + } else if (use_scrq_send_direct) { + /* See above comment, CSO disabled with direct xmit */ + tx_crq.v1.flags1 &= ~(IBMVNIC_TX_CHKSUM_OFFLOAD); ind_bufp->index = 1; tx_buff->num_entries = 1; netdev_tx_sent_queue(txq, skb->len); + ind_bufp->indir_arr[0] = tx_crq; lpar_rc = ibmvnic_tx_scrq_flush(adapter, tx_scrq, false); if (lpar_rc != H_SUCCESS) goto tx_err;