diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 0c4d5d40cae9..d7131a1afadf 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -1137,6 +1137,10 @@ attribute-sets: name: dpll-pin type: nest nested-attributes: link-dpll-pin-attrs + - + name: max-pacing-offload-horizon + type: uint + doc: EDT offload horizon supported by the device (in nsec). - name: af-spec-attrs attributes: diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 22b07c814f4a..49f03cb78c6e 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -183,3 +183,4 @@ struct_devlink_port* devlink_port struct_dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder +u64 max_pacing_offload_horizon diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..49a7e7db0883 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2009,6 +2009,8 @@ enum netdev_reg_state { * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * + * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2399,6 +2401,8 @@ struct net_device { /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; + u64 max_pacing_offload_horizon; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6dc258993b17..506ba9c80e83 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX }; diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a3cd0c2dc995..25a9a47001cd 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -836,6 +836,8 @@ enum { TCA_FQ_WEIGHTS, /* Weights for each band */ + TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */ + __TCA_FQ_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0a520987085..682d8d3127db 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1118,6 +1118,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } @@ -1867,6 +1868,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || + nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, + READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || @@ -1975,6 +1978,7 @@ nla_put_failure: } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 19a49af5a9e5..aeabf45c9200 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -111,6 +111,7 @@ struct fq_perband_flows { struct fq_sched_data { /* Read mostly cache line */ + u64 offload_horizon; u32 quantum; u32 initial_quantum; u32 flow_refill_delay; @@ -299,7 +300,7 @@ static void fq_gc(struct fq_sched_data *q, } /* Fast path can be used if : - * 1) Packet tstamp is in the past. + * 1) Packet tstamp is in the past, or within the pacing offload horizon. * 2) FQ qlen == 0 OR * (no flow is currently eligible for transmit, * AND fast path queue has less than 8 packets) @@ -314,7 +315,7 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, const struct fq_sched_data *q = qdisc_priv(sch); const struct sock *sk; - if (fq_skb_cb(skb)->time_to_send > now) + if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon) return false; if (sch->q.qlen != 0) { @@ -595,15 +596,18 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) unsigned long sample; struct rb_node *p; - if (q->time_next_delayed_flow > now) + if (q->time_next_delayed_flow > now + q->offload_horizon) return; /* Update unthrottle latency EWMA. * This is cheap and can help diagnosing timer/latency problems. */ sample = (unsigned long)(now - q->time_next_delayed_flow); - q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; - q->unthrottle_latency_ns += sample >> 3; + if ((long)sample > 0) { + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + } + now += q->offload_horizon; q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { @@ -687,7 +691,7 @@ begin: u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, f->time_next_packet); - if (now < time_next_packet) { + if (now + q->offload_horizon < time_next_packet) { head->first = f->next; f->time_next_packet = time_next_packet; fq_flow_set_throttled(q, f); @@ -925,6 +929,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, [TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)), [TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)), + [TCA_FQ_OFFLOAD_HORIZON] = { .type = NLA_U32 }, }; /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ @@ -1100,6 +1105,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->horizon_drop, nla_get_u8(tb[TCA_FQ_HORIZON_DROP])); + if (tb[TCA_FQ_OFFLOAD_HORIZON]) { + u64 offload_horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]); + + if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) { + WRITE_ONCE(q->offload_horizon, offload_horizon); + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon"); + err = -EINVAL; + } + } if (!err) { sch_tree_unlock(sch); @@ -1183,6 +1199,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) .bands = FQ_BANDS, }; struct nlattr *opts; + u64 offload_horizon; u64 ce_threshold; s32 weights[3]; u64 horizon; @@ -1199,6 +1216,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) horizon = READ_ONCE(q->horizon); do_div(horizon, NSEC_PER_USEC); + offload_horizon = READ_ONCE(q->offload_horizon); + do_div(offload_horizon, NSEC_PER_USEC); + if (nla_put_u32(skb, TCA_FQ_PLIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, @@ -1224,6 +1244,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_TIMER_SLACK, READ_ONCE(q->timer_slack)) || nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) || nla_put_u8(skb, TCA_FQ_HORIZON_DROP, READ_ONCE(q->horizon_drop))) goto nla_put_failure; diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index f0d71b2a3f1e..96ec2b01e725 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX };