From 9234a37a495dc34cece943bec495ab541e4143ab Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:50 +0100 Subject: [PATCH 01/11] vxlan: In vxlan_rcv(), access flags through the vxlan netdevice vxlan_sock.flags is constructed from vxlan_dev.cfg.flags, as the subset of flags (named VXLAN_F_RCV_FLAGS) that is important from the point of view of socket sharing. Attempts to reconfigure these flags during the vxlan netdev lifetime are also bounced. It is therefore immaterial whether we access the flags through the vxlan_dev or through the socket. Convert the socket accesses to netdevice accesses in this separate patch to make the conversions that take place in the following patches more obvious. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/5d237ffd731055e524d7b7c436de43358d8743d2.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b46a799bd390..1ac2dcdd493e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1717,7 +1717,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ - if (vs->flags & VXLAN_F_GPE) { + if (vxlan->cfg.flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(&unparsed, &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; @@ -1730,8 +1730,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; } - if (vs->flags & VXLAN_F_REMCSUM_RX) { - reason = vxlan_remcsum(&unparsed, skb, vs->flags); + if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) { + reason = vxlan_remcsum(&unparsed, skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; } @@ -1756,8 +1756,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - if (vs->flags & VXLAN_F_GBP) - vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); + if (vxlan->cfg.flags & VXLAN_F_GBP) + vxlan_parse_gbp_hdr(&unparsed, skb, vxlan->cfg.flags, md); /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ From 0f09ae907818d593e55c4b058d286a0914a43c3f Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:51 +0100 Subject: [PATCH 02/11] vxlan: vxlan_rcv() callees: Move clearing of unparsed flags out In order to migrate away from the use of unparsed to detect invalid flags, move all the code that actually clears the flags from callees directly to vxlan_rcv(). Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/2857871d929375c881b9defe378473c8200ead9b.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 1ac2dcdd493e..c2254b0ac964 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1562,7 +1562,7 @@ static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, size_t start, offset; if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) - goto out; + return SKB_NOT_DROPPED_YET; start = vxlan_rco_start(unparsed->vx_vni); offset = start + vxlan_rco_offset(unparsed->vx_vni); @@ -1573,10 +1573,6 @@ static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); -out: - unparsed->vx_flags &= ~VXLAN_HF_RCO; - unparsed->vx_vni &= VXLAN_VNI_MASK; - return SKB_NOT_DROPPED_YET; } @@ -1588,7 +1584,7 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, struct metadata_dst *tun_dst; if (!(unparsed->vx_flags & VXLAN_HF_GBP)) - goto out; + return; md->gbp = ntohs(gbp->policy_id); @@ -1607,8 +1603,6 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; -out: - unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan, @@ -1734,6 +1728,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) reason = vxlan_remcsum(&unparsed, skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; + unparsed.vx_flags &= ~VXLAN_HF_RCO; + unparsed.vx_vni &= VXLAN_VNI_MASK; } if (vxlan_collect_metadata(vs)) { @@ -1756,8 +1752,10 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - if (vxlan->cfg.flags & VXLAN_F_GBP) + if (vxlan->cfg.flags & VXLAN_F_GBP) { vxlan_parse_gbp_hdr(&unparsed, skb, vxlan->cfg.flags, md); + unparsed.vx_flags &= ~VXLAN_GBP_USED_BITS; + } /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ From fe3dcbcfae522fae9c62954488398562ff6b5ece Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:52 +0100 Subject: [PATCH 03/11] vxlan: vxlan_rcv() callees: Drop the unparsed argument The functions vxlan_remcsum() and vxlan_parse_gbp_hdr() take both the SKB and the unparsed VXLAN header. Now that unparsed adjustment is handled directly by vxlan_rcv(), drop this argument, and have the function derive it from the SKB on its own. vxlan_parse_gpe_proto() does not take SKB, so keep the header parameter. However const it so that it's clear that the intention is that it does not get changed. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/5ea651f4e06485ba1a84a8eb556a457c39f0dfd4.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c2254b0ac964..37c20cf93f92 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -622,9 +622,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol) +static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol) { - struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr; + const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. */ if (!gpe->np_applied) @@ -1554,18 +1554,17 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan) #endif } -static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, - struct sk_buff *skb, - u32 vxflags) +static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags) { + const struct vxlanhdr *vh = vxlan_hdr(skb); enum skb_drop_reason reason; size_t start, offset; - if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) + if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) return SKB_NOT_DROPPED_YET; - start = vxlan_rco_start(unparsed->vx_vni); - offset = start + vxlan_rco_offset(unparsed->vx_vni); + start = vxlan_rco_start(vh->vx_vni); + offset = start + vxlan_rco_offset(vh->vx_vni); reason = pskb_may_pull_reason(skb, offset + sizeof(u16)); if (reason) @@ -1576,14 +1575,16 @@ static enum skb_drop_reason vxlan_remcsum(struct vxlanhdr *unparsed, return SKB_NOT_DROPPED_YET; } -static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, - struct sk_buff *skb, u32 vxflags, +static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { - struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; + const struct vxlanhdr *vh = vxlan_hdr(skb); + const struct vxlanhdr_gbp *gbp; struct metadata_dst *tun_dst; - if (!(unparsed->vx_flags & VXLAN_HF_GBP)) + gbp = (const struct vxlanhdr_gbp *)vh; + + if (!(vh->vx_flags & VXLAN_HF_GBP)) return; md->gbp = ntohs(gbp->policy_id); @@ -1712,7 +1713,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) * used by VXLAN extensions if explicitly requested. */ if (vxlan->cfg.flags & VXLAN_F_GPE) { - if (!vxlan_parse_gpe_proto(&unparsed, &protocol)) + if (!vxlan_parse_gpe_proto(vxlan_hdr(skb), &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; @@ -1725,7 +1726,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) } if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) { - reason = vxlan_remcsum(&unparsed, skb, vxlan->cfg.flags); + reason = vxlan_remcsum(skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; unparsed.vx_flags &= ~VXLAN_HF_RCO; @@ -1753,7 +1754,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) } if (vxlan->cfg.flags & VXLAN_F_GBP) { - vxlan_parse_gbp_hdr(&unparsed, skb, vxlan->cfg.flags, md); + vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md); unparsed.vx_flags &= ~VXLAN_GBP_USED_BITS; } /* Note that GBP and GPE can never be active together. This is From e713130dfb4d6b5a2cd42f33a94b6ac983d2989d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:53 +0100 Subject: [PATCH 04/11] vxlan: vxlan_rcv(): Extract vxlan_hdr(skb) to a named variable Having a named reference to the VXLAN header is more handy than having to conjure it anew through vxlan_hdr() on every use. Add a new variable and convert several open-coded sites. Additionally, convert one "unparsed" use to the new variable as well. Thus the only "unparsed" uses that remain are the flag-clearing and the header validity check at the end. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Mateusz Polchlopek Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/2a0a940e883c435a0fdbcdc1d03c4858957ad00e.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 37c20cf93f92..f114568b67a3 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1667,6 +1667,7 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { struct vxlan_vni_node *vninode = NULL; + const struct vxlanhdr *vh; struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr unparsed; @@ -1685,11 +1686,11 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; unparsed = *vxlan_hdr(skb); + vh = vxlan_hdr(skb); /* VNI flag always required to be set */ - if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { + if (!(vh->vx_flags & VXLAN_HF_VNI)) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxlan_hdr(skb)->vx_flags), - ntohl(vxlan_hdr(skb)->vx_vni)); + ntohl(vh->vx_flags), ntohl(vh->vx_vni)); reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; /* Return non vxlan pkt */ goto drop; @@ -1701,7 +1702,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (!vs) goto drop; - vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); + vni = vxlan_vni(vh->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); if (!vxlan) { @@ -1713,7 +1714,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) * used by VXLAN extensions if explicitly requested. */ if (vxlan->cfg.flags & VXLAN_F_GPE) { - if (!vxlan_parse_gpe_proto(vxlan_hdr(skb), &protocol)) + if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; From e4f8647767cfac0291def86ddfac23b925294701 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:54 +0100 Subject: [PATCH 05/11] vxlan: Track reserved bits explicitly as part of the configuration In order to make it possible to configure which bits in VXLAN header should be considered reserved, introduce a new field vxlan_config::reserved_bits. Have it cover the whole header, except for the VNI-present bit and the bits for VNI itself, and have individual enabled features clear more bits off reserved_bits. (This is expressed as first constructing a used_bits set, and then inverting it to get the reserved_bits. The set of used_bits will be useful on its own for validation of user-set reserved_bits in a following patch.) The patch also moves a comment relevant to the validation from the unparsed validation site up to the new site. Logically this patch should add the new comment, and a later patch that removes the unparsed bits would remove the old comment. But keeping both legs in the same patch is better from the history spelunking point of view. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/984dbf98d5940d3900268dbffaf70961f731d4a4.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 41 +++++++++++++++++++++++++--------- include/net/vxlan.h | 1 + 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index f114568b67a3..e50f4cb70193 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1710,9 +1710,20 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) goto drop; } - /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly requested. - */ + if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags || + vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) { + /* If the header uses bits besides those enabled by the + * netdevice configuration, treat this as a malformed packet. + * This behavior diverges from VXLAN RFC (RFC7348) which + * stipulates that bits in reserved in reserved fields are to be + * ignored. The approach here maintains compatibility with + * previous stack code, and also is more robust and provides a + * little more security in adding extensions to VXLAN. + */ + reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; + goto drop; + } + if (vxlan->cfg.flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; @@ -1763,14 +1774,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) */ if (unparsed.vx_flags || unparsed.vx_vni) { - /* If there are any unprocessed flags remaining treat - * this as a malformed packet. This behavior diverges from - * VXLAN RFC (RFC7348) which stipulates that bits in reserved - * in reserved fields are to be ignored. The approach here - * maintains compatibility with previous stack code, and also - * is more robust and provides a little more security in - * adding extensions to VXLAN. - */ reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; goto drop; } @@ -4070,6 +4073,10 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { + struct vxlanhdr used_bits = { + .vx_flags = VXLAN_HF_VNI, + .vx_vni = VXLAN_VNI_MASK, + }; struct vxlan_dev *vxlan = netdev_priv(dev); int err = 0; @@ -4296,6 +4303,8 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], extack); if (err) return err; + used_bits.vx_flags |= VXLAN_HF_RCO; + used_bits.vx_vni |= ~VXLAN_VNI_MASK; } if (data[IFLA_VXLAN_GBP]) { @@ -4303,6 +4312,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], VXLAN_F_GBP, changelink, false, extack); if (err) return err; + used_bits.vx_flags |= VXLAN_GBP_USED_BITS; } if (data[IFLA_VXLAN_GPE]) { @@ -4311,8 +4321,17 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], extack); if (err) return err; + + used_bits.vx_flags |= VXLAN_GPE_USED_BITS; } + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. + */ + conf->reserved_bits = (struct vxlanhdr) { + .vx_flags = ~used_bits.vx_flags, + .vx_vni = ~used_bits.vx_vni, + }; if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 33ba6fc151cf..2dd23ee2bacd 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -227,6 +227,7 @@ struct vxlan_config { unsigned int addrmax; bool no_share; enum ifla_vxlan_df df; + struct vxlanhdr reserved_bits; }; enum { From 752b1c8d8b409f2b03e61e153696689ee081bf07 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:55 +0100 Subject: [PATCH 06/11] vxlan: Bump error counters for header mismatches The VXLAN driver so far has not increased the error counters for packets that set reserved bits. It does so for other packet errors, so do it for this case as well. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/d096084167d56706d620afe5136cf37a2d34d1b9.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index e50f4cb70193..92832a396ab7 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1721,6 +1721,10 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) * little more security in adding extensions to VXLAN. */ reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; + DEV_STATS_INC(vxlan->dev, rx_frame_errors); + DEV_STATS_INC(vxlan->dev, rx_errors); + vxlan_vnifilter_count(vxlan, vni, vninode, + VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } From bb16786ed6fdff3a67ba33ed928ae138fd4254b5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:56 +0100 Subject: [PATCH 07/11] vxlan: vxlan_rcv(): Drop unparsed The code currently validates the VXLAN header in two ways: first by comparing it with the set of reserved bits, constructed ahead of time during the netdevice construction; and second by gradually clearing the bits off a separate copy of VXLAN header, "unparsed". Drop the latter validation method. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/4559f16c5664c189b3a4ee6f5da91f552ad4821c.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 92832a396ab7..ff5684a2103a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1670,7 +1670,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) const struct vxlanhdr *vh; struct vxlan_dev *vxlan; struct vxlan_sock *vs; - struct vxlanhdr unparsed; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); @@ -1685,7 +1684,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (reason) goto drop; - unparsed = *vxlan_hdr(skb); vh = vxlan_hdr(skb); /* VNI flag always required to be set */ if (!(vh->vx_flags & VXLAN_HF_VNI)) { @@ -1695,8 +1693,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) /* Return non vxlan pkt */ goto drop; } - unparsed.vx_flags &= ~VXLAN_HF_VNI; - unparsed.vx_vni &= ~VXLAN_VNI_MASK; vs = rcu_dereference_sk_user_data(sk); if (!vs) @@ -1731,7 +1727,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (vxlan->cfg.flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto drop; - unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; } @@ -1745,8 +1740,6 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) reason = vxlan_remcsum(skb, vxlan->cfg.flags); if (unlikely(reason)) goto drop; - unparsed.vx_flags &= ~VXLAN_HF_RCO; - unparsed.vx_vni &= VXLAN_VNI_MASK; } if (vxlan_collect_metadata(vs)) { @@ -1769,19 +1762,12 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) memset(md, 0, sizeof(*md)); } - if (vxlan->cfg.flags & VXLAN_F_GBP) { + if (vxlan->cfg.flags & VXLAN_F_GBP) vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md); - unparsed.vx_flags &= ~VXLAN_GBP_USED_BITS; - } /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ - if (unparsed.vx_flags || unparsed.vx_vni) { - reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; - goto drop; - } - if (!raw_proto) { reason = vxlan_set_mac(vxlan, vs, skb, vni); if (reason) From 6c11379b104e3718135fd7fc37bb254b41e4cf65 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:57 +0100 Subject: [PATCH 08/11] vxlan: Add an attribute to make VXLAN header validation configurable The set of bits that the VXLAN netdevice currently considers reserved is defined by the features enabled at the netdevice construction. In order to make this configurable, add an attribute, IFLA_VXLAN_RESERVED_BITS. The payload is a pair of big-endian u32's covering the VXLAN header. This is validated against the set of flags used by the various enabled VXLAN features, and attempts to override bits used by an enabled feature are bounced. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/c657275e5ceed301e62c69fe8e559e32909442e2.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 53 +++++++++++++++++++++++++++++----- include/uapi/linux/if_link.h | 1 + 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index ff5684a2103a..43cf672b7b9f 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3428,6 +3428,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), + [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4315,13 +4316,44 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], used_bits.vx_flags |= VXLAN_GPE_USED_BITS; } - /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly requested. - */ - conf->reserved_bits = (struct vxlanhdr) { - .vx_flags = ~used_bits.vx_flags, - .vx_vni = ~used_bits.vx_vni, - }; + if (data[IFLA_VXLAN_RESERVED_BITS]) { + struct vxlanhdr reserved_bits; + + if (changelink) { + NL_SET_ERR_MSG_ATTR(extack, + data[IFLA_VXLAN_RESERVED_BITS], + "Cannot change reserved_bits"); + return -EOPNOTSUPP; + } + + nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS], + sizeof(reserved_bits)); + if (used_bits.vx_flags & reserved_bits.vx_flags || + used_bits.vx_vni & reserved_bits.vx_vni) { + __be64 ub_be64, rb_be64; + + memcpy(&ub_be64, &used_bits, sizeof(ub_be64)); + memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64)); + + NL_SET_ERR_MSG_ATTR_FMT(extack, + data[IFLA_VXLAN_RESERVED_BITS], + "Used bits %#018llx cannot overlap reserved bits %#018llx", + be64_to_cpu(ub_be64), + be64_to_cpu(rb_be64)); + return -EINVAL; + } + + conf->reserved_bits = reserved_bits; + } else { + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. + */ + conf->reserved_bits = (struct vxlanhdr) { + .vx_flags = ~used_bits.vx_flags, + .vx_vni = ~used_bits.vx_vni, + }; + } + if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, @@ -4506,6 +4538,8 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ + /* IFLA_VXLAN_RESERVED_BITS */ + nla_total_size(sizeof(struct vxlanhdr)) + 0; } @@ -4608,6 +4642,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) goto nla_put_failure; + if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS, + sizeof(vxlan->cfg.reserved_bits), + &vxlan->cfg.reserved_bits)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2575e0cd9b48..77730c340c8f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1394,6 +1394,7 @@ enum { IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ + IFLA_VXLAN_RESERVED_BITS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) From 8653eb21d68c6882ce5716b04379431817310b85 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:58 +0100 Subject: [PATCH 09/11] selftests: net: lib: Rename ip_link_master() to ip_link_set_master() Let's have a verb in that function name to make it clearer what's going on. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/fbf7c53a429b340b9cff5831280ea8c305a224f9.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fdb_notify.sh | 6 +++--- tools/testing/selftests/net/lib.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/fdb_notify.sh b/tools/testing/selftests/net/fdb_notify.sh index c03151e7791c..c159230c9b62 100755 --- a/tools/testing/selftests/net/fdb_notify.sh +++ b/tools/testing/selftests/net/fdb_notify.sh @@ -49,7 +49,7 @@ test_dup_vxlan_self() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add "vxlan" dev vx self dst 192.0.2.1 do_test_dup del "vxlan" dev vx self dst 192.0.2.1 @@ -59,7 +59,7 @@ test_dup_vxlan_master() { ip_link_add br up type bridge vlan_filtering 1 ip_link_add vx up type vxlan id 2000 dstport 4789 - ip_link_master vx br + ip_link_set_master vx br do_test_dup add "vxlan master" dev vx master do_test_dup del "vxlan master" dev vx master @@ -79,7 +79,7 @@ test_dup_macvlan_master() ip_link_add br up type bridge vlan_filtering 1 ip_link_add dd up type dummy ip_link_add mv up link dd type macvlan mode passthru - ip_link_master mv br + ip_link_set_master mv br do_test_dup add "macvlan master" dev mv self do_test_dup del "macvlan master" dev mv self diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 8994fec1c38f..5ea6537acd2b 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -451,7 +451,7 @@ ip_link_add() defer ip link del dev "$name" } -ip_link_master() +ip_link_set_master() { local member=$1; shift local master=$1; shift From d76ccb2ec368c8a44f64839140cd253c19f6a79a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:59 +0100 Subject: [PATCH 10/11] selftests: net: lib: Add several autodefer helpers Add ip_link_set_addr(), ip_link_set_up(), ip_addr_add() and ip_route_add() to the suite of helpers that automatically schedule a corresponding cleanup. When setting a new MAC, one needs to remember the old address first. Move mac_get() from forwarding/ to that end. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/add6bcbe30828fd01363266df20c338cf13aaf25.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/lib.sh | 7 ---- tools/testing/selftests/net/lib.sh | 39 +++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 7337f398f9cc..1fd40bada694 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -932,13 +932,6 @@ packets_rate() echo $(((t1 - t0) / interval)) } -mac_get() -{ - local if_name=$1 - - ip -j link show dev $if_name | jq -r '.[]["address"]' -} - ether_addr_to_u64() { local addr="$1" diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 5ea6537acd2b..2cd5c743b2d9 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -435,6 +435,13 @@ xfail_on_veth() fi } +mac_get() +{ + local if_name=$1 + + ip -j link show dev $if_name | jq -r '.[]["address"]' +} + kill_process() { local pid=$1; shift @@ -459,3 +466,35 @@ ip_link_set_master() ip link set dev "$member" master "$master" defer ip link set dev "$member" nomaster } + +ip_link_set_addr() +{ + local name=$1; shift + local addr=$1; shift + + local old_addr=$(mac_get "$name") + ip link set dev "$name" address "$addr" + defer ip link set dev "$name" address "$old_addr" +} + +ip_link_set_up() +{ + local name=$1; shift + + ip link set dev "$name" up + defer ip link set dev "$name" down +} + +ip_addr_add() +{ + local name=$1; shift + + ip addr add dev "$name" "$@" + defer ip addr del dev "$name" "$@" +} + +ip_route_add() +{ + ip route add "$@" + defer ip route del "$@" +} From d84b5dccf3ebdeeabef910d1c19b931c84f67884 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:41:00 +0100 Subject: [PATCH 11/11] selftests: forwarding: Add a selftest for the new reserved_bits UAPI Run VXLAN packets through a gateway. Flip individual bits of the packet and/or reserved bits of the gateway, and check that the gateway treats the packets as expected. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/388bef3c30ebc887d4e64cd86a362e2df2f2d2e1.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/forwarding/Makefile | 1 + .../net/forwarding/vxlan_reserved.sh | 352 ++++++++++++++++++ 2 files changed, 353 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/vxlan_reserved.sh diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 7d885cff8d79..00bde7b6f39e 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -105,6 +105,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_bridge_1q_port_8472_ipv6.sh \ vxlan_bridge_1q_port_8472.sh \ vxlan_bridge_1q.sh \ + vxlan_reserved.sh \ vxlan_symmetric_ipv6.sh \ vxlan_symmetric.sh diff --git a/tools/testing/selftests/net/forwarding/vxlan_reserved.sh b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh new file mode 100755 index 000000000000..46c31794b91b --- /dev/null +++ b/tools/testing/selftests/net/forwarding/vxlan_reserved.sh @@ -0,0 +1,352 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +# | H1 (vrf) | +# | + $h1 | +# | | 192.0.2.1/28 | +# +----|---------------+ +# | +# +----|--------------------------------+ +# | SW | | +# | +--|------------------------------+ | +# | | + $swp1 BR1 (802.1d) | | +# | | | | +# | | + vx1 (vxlan) | | +# | | local 192.0.2.17 | | +# | | id 1000 dstport $VXPORT | | +# | +---------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +--|----------------------------------+ +# | +# +--|----------------------------------+ +# | | | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | +# | VRP2 (vrf) | +# +-------------------------------------+ + +: ${VXPORT:=4789} +: ${ALL_TESTS:=" + default_test + plain_test + reserved_0_test + reserved_10_test + reserved_31_test + reserved_56_test + reserved_63_test + "} + +NUM_NETIFS=4 +source lib.sh + +h1_create() +{ + simple_if_init $h1 192.0.2.1/28 + defer simple_if_fini $h1 192.0.2.1/28 + + tc qdisc add dev $h1 clsact + defer tc qdisc del dev $h1 clsact + + tc filter add dev $h1 ingress pref 77 \ + prot ip flower skip_hw ip_proto icmp action drop + defer tc filter del dev $h1 ingress pref 77 +} + +switch_create() +{ + ip_link_add br1 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip_link_set_addr br1 $(mac_get $swp1) + ip_link_set_up br1 + + ip_link_set_up $rp1 + ip_addr_add $rp1 192.0.2.17/28 + ip_route_add 192.0.2.32/28 nexthop via 192.0.2.18 + + ip_link_set_master $swp1 br1 + ip_link_set_up $swp1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + defer simple_if_fini $rp2 192.0.2.18/28 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + rp1=${NETIFS[p3]} + rp2=${NETIFS[p4]} + + vrf_prepare + defer vrf_cleanup + + forwarding_enable + defer forwarding_restore + + h1_create + switch_create + + vrp2_create +} + +vxlan_header_bytes() +{ + local vni=$1; shift + local -a extra_bits=("$@") + local -a bits + local i + + for ((i=0; i < 64; i++)); do + bits[i]=0 + done + + # Bit 4 is the I flag and is always on. + bits[4]=1 + + for i in ${extra_bits[@]}; do + bits[i]=1 + done + + # Bits 32..55 carry the VNI + local mask=0x800000 + for ((i=0; i < 24; i++)); do + bits[$((i + 32))]=$(((vni & mask) != 0)) + ((mask >>= 1)) + done + + local bytes + for ((i=0; i < 8; i++)); do + local byte=0 + local j + for ((j=0; j < 8; j++)); do + local bit=${bits[8 * i + j]} + ((byte += bit << (7 - j))) + done + bytes+=$(printf %02x $byte): + done + + echo ${bytes%:} +} + +neg_bytes() +{ + local bytes=$1; shift + + local -A neg=([0]=f [1]=e [2]=d [3]=c [4]=b [5]=a [6]=9 [7]=8 + [8]=7 [9]=6 [a]=5 [b]=4 [c]=3 [d]=2 [e]=1 [f]=0 [:]=:) + local out + local i + + for ((i=0; i < ${#bytes}; i++)); do + local c=${bytes:$i:1} + out+=${neg[$c]} + done + echo $out +} + +vxlan_ping_do() +{ + local count=$1; shift + local dev=$1; shift + local next_hop_mac=$1; shift + local dest_ip=$1; shift + local dest_mac=$1; shift + local vni=$1; shift + local reserved_bits=$1; shift + + local vxlan_header=$(vxlan_header_bytes $vni $reserved_bits) + + $MZ $dev -c $count -d 100msec -q \ + -b $next_hop_mac -B $dest_ip \ + -t udp sp=23456,dp=$VXPORT,p=$(: + )"$vxlan_header:"$( : VXLAN + )"$dest_mac:"$( : ETH daddr + )"00:11:22:33:44:55:"$( : ETH saddr + )"08:00:"$( : ETH type + )"45:"$( : IP version + IHL + )"00:"$( : IP TOS + )"00:54:"$( : IP total length + )"99:83:"$( : IP identification + )"40:00:"$( : IP flags + frag off + )"40:"$( : IP TTL + )"01:"$( : IP proto + )"00:00:"$( : IP header csum + )"$(ipv4_to_bytes 192.0.2.3):"$( : IP saddr + )"$(ipv4_to_bytes 192.0.2.1):"$( : IP daddr + )"08:"$( : ICMP type + )"00:"$( : ICMP code + )"8b:f2:"$( : ICMP csum + )"1f:6a:"$( : ICMP request identifier + )"00:01:"$( : ICMP request seq. number + )"4f:ff:c5:5b:00:00:00:00:"$( : ICMP payload + )"6d:74:0b:00:00:00:00:00:"$( : + )"10:11:12:13:14:15:16:17:"$( : + )"18:19:1a:1b:1c:1d:1e:1f:"$( : + )"20:21:22:23:24:25:26:27:"$( : + )"28:29:2a:2b:2c:2d:2e:2f:"$( : + )"30:31:32:33:34:35:36:37" +} + +vxlan_device_add() +{ + ip_link_add vx1 up type vxlan id 1000 \ + local 192.0.2.17 dstport "$VXPORT" \ + nolearning noudpcsum tos inherit ttl 100 "$@" + ip_link_set_master vx1 br1 +} + +vxlan_all_reserved_bits() +{ + local i + + for ((i=0; i < 64; i++)); do + if ((i == 4 || i >= 32 && i < 56)); then + continue + fi + echo $i + done +} + +vxlan_ping_vanilla() +{ + vxlan_ping_do 10 $rp2 $(mac_get $rp1) 192.0.2.17 $(mac_get $h1) 1000 +} + +vxlan_ping_reserved() +{ + for bit in $(vxlan_all_reserved_bits); do + vxlan_ping_do 1 $rp2 $(mac_get $rp1) \ + 192.0.2.17 $(mac_get $h1) 1000 "$bit" + ((n++)) + done +} + +vxlan_ping_test() +{ + local what=$1; shift + local get_stat=$1; shift + local expect=$1; shift + + RET=0 + + local t0=$($get_stat) + + "$@" + check_err $? "Failure when running $@" + + local t1=$($get_stat) + local delta=$((t1 - t0)) + + ((expect == delta)) + check_err $? "Expected to capture $expect packets, got $delta." + + log_test "$what" +} + +__default_test_do() +{ + local n_allowed_bits=$1; shift + local what=$1; shift + + vxlan_ping_test "$what: clean packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + 10 vxlan_ping_vanilla + + local t0=$(link_stats_get vx1 rx errors) + vxlan_ping_test "$what: mangled packets" \ + "tc_rule_stats_get $h1 77 ingress" \ + $n_allowed_bits vxlan_ping_reserved + local t1=$(link_stats_get vx1 rx errors) + + RET=0 + local expect=$((39 - n_allowed_bits)) + local delta=$((t1 - t0)) + ((expect == delta)) + check_err $? "Expected $expect error packets, got $delta." + log_test "$what: drops reported" +} + +default_test_do() +{ + vxlan_device_add + __default_test_do 0 "Default" +} + +default_test() +{ + in_defer_scope \ + default_test_do +} + +plain_test_do() +{ + vxlan_device_add reserved_bits 0xf7ffffff000000ff + __default_test_do 0 "reserved_bits 0xf7ffffff000000ff" +} + +plain_test() +{ + in_defer_scope \ + plain_test_do +} + +reserved_test() +{ + local bit=$1; shift + + local allowed_bytes=$(vxlan_header_bytes 0xffffff $bit) + local reserved_bytes=$(neg_bytes $allowed_bytes) + local reserved_bits=${reserved_bytes//:/} + + vxlan_device_add reserved_bits 0x$reserved_bits + __default_test_do 1 "reserved_bits 0x$reserved_bits" +} + +reserved_0_test() +{ + in_defer_scope \ + reserved_test 0 +} + +reserved_10_test() +{ + in_defer_scope \ + reserved_test 10 +} + +reserved_31_test() +{ + in_defer_scope \ + reserved_test 31 +} + +reserved_56_test() +{ + in_defer_scope \ + reserved_test 56 +} + +reserved_63_test() +{ + in_defer_scope \ + reserved_test 63 +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS