/*
 * kgraft_patch_bsc1165631_vxlan
 *
 * Fix for CVE-2020-1749, bsc#1165631 (vxlan.ko part)
 *
 *  Copyright (c) 2020 SUSE
 *  Author: Nicolai Stange <nstange@suse.de>
 *
 *  Based on the original Linux kernel code. Other copyrights apply.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#if !IS_MODULE(CONFIG_VXLAN)
#error "Live patch supports only CONFIG_VXLAN=m"
#endif

#define KGR_PATCHED_MODULE "vxlan"

#define pr_fmt(fmt) KGR_PATCHED_MODULE ": " fmt

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/rculist.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/hash.h>
#include <linux/ethtool.h>
#include <net/ndisc.h>
#include <net/ip_tunnels.h>
#include <net/udp_tunnel.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/vxlan.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/dst_metadata.h>

#include <linux/module.h>
#include "kgr_patch_bsc1165631.h"
#include "../kallsyms_relocs.h"


/* from include/net/udp_tunnel.h */
/*
 * Indirect entry point for udp_tunnel_xmit_skb(). The pointer starts out
 * unset; it is listed in kgr_funcs[] under the symbol name
 * "udp_tunnel_xmit_skb" (third table field "udp_tunnel", presumably the
 * providing module) and is handed to __kgr_resolve_kallsyms_relocs()
 * together with the rest of that table. It is called from
 * kgrp_vxlan_xmit_one() on the IPv4 transmit path.
 */
static int (*kgre_udp_tunnel_xmit_skb)(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
			__be32 src, __be32 dst, __u8 tos, __u8 ttl,
			__be16 df, __be16 src_port, __be16 dst_port,
			bool xnet, bool nocheck);

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Indirect entry point for udp_tunnel6_xmit_skb(). Listed in kgr_funcs[]
 * under the symbol name "udp_tunnel6_xmit_skb" (third table field
 * "ip6_udp_tunnel", presumably the providing module) and handed to
 * __kgr_resolve_kallsyms_relocs() with the rest of that table. It is
 * called from kgrp_vxlan_xmit_one() on the IPv6 transmit path.
 */
static int (*kgre_udp_tunnel6_xmit_skb)(struct dst_entry *dst, struct sock *sk,
			 struct sk_buff *skb,
			 struct net_device *dev, struct in6_addr *saddr,
			 struct in6_addr *daddr,
			 __u8 prio, __u8 ttl, __be16 src_port,
			 __be16 dst_port, bool nocheck);
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif



/* from drivers/net/vxlan.c */
#if IS_ENABLED(CONFIG_IPV6)

/*
 * Tell whether @ipa is the unspecified ("any") address.
 *
 * An AF_INET6 address is checked with ipv6_addr_any(); every other family
 * is treated as IPv4 and compared against INADDR_ANY.
 */
static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
	const bool is_v6 = (ipa->sa.sa_family == AF_INET6);

	return is_v6 ? ipv6_addr_any(&ipa->sin6.sin6_addr)
		     : (ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY));
}

/*
 * Tell whether @ipa is a multicast address.
 *
 * Anything that is not AF_INET6 is handled as IPv4 and tested with
 * IN_MULTICAST() on the host-order address; AF_INET6 addresses go through
 * ipv6_addr_is_multicast().
 */
static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
	if (ipa->sa.sa_family != AF_INET6)
		return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));

	return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
}

#else /* !CONFIG_IPV6 */
#error "klp-ccp: non-taken branch"
#endif

/*
 * Indirect entry point for vxlan_find_vni(). Listed in kgr_funcs[] under
 * the symbol name "vxlan_find_vni" (third table field "vxlan") and handed
 * to __kgr_resolve_kallsyms_relocs() with the rest of that table.
 * kgrp_vxlan_xmit_one() uses it to look up the receiving device when the
 * route to the destination turns out to be local; a NULL result is
 * treated there as a transmit error.
 */
static struct vxlan_dev *(*kgre_vxlan_find_vni)(struct net *net, u32 id,
					sa_family_t family, __be16 port,
					u32 flags);

/*
 * Indirect entry point for vxlan_snoop(). Listed in kgr_funcs[] under the
 * symbol name "vxlan_snoop" (third table field "vxlan") and handed to
 * __kgr_resolve_kallsyms_relocs() with the rest of that table.
 * kgrr_vxlan_encap_bypass() calls it when the destination device has
 * VXLAN_F_LEARN set; the return value is ignored there.
 */
static bool (*kgre_vxlan_snoop)(struct net_device *dev,
			union vxlan_addr *src_ip, const u8 *src_mac);

/*
 * Encode the group based policy value carried in @md into the VXLAN header.
 *
 * @vxh:     header being built; it is reinterpreted as struct vxlanhdr_gbp
 * @vxflags: unused here
 * @md:      metadata whose ->gbp field supplies the flag bits and policy id
 *
 * Nothing is written when md->gbp is zero. Otherwise VXLAN_HF_GBP is set in
 * vx_flags, the "don't learn" and "policy applied" bits are raised when the
 * matching VXLAN_GBP_* bits are present, and the policy id is stored in
 * network byte order.
 */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp_hdr = (struct vxlanhdr_gbp *)vxh;
	const u32 policy = md->gbp;

	if (policy == 0)
		return;

	vxh->vx_flags |= htonl(VXLAN_HF_GBP);

	if (policy & VXLAN_GBP_DONT_LEARN)
		gbp_hdr->dont_learn = 1;
	if (policy & VXLAN_GBP_POLICY_APPLIED)
		gbp_hdr->policy_applied = 1;

	gbp_hdr->policy_id = htons(policy & VXLAN_GBP_ID_MASK);
}

/*
 * Prepend a VXLAN header to @skb and set up its offload state.
 *
 * @skb:       packet to encapsulate
 * @dst:       route the packet will leave through; supplies the link layer
 *             and dst header room requirements
 * @iphdr_len: size of the outer IP header that will be added afterwards
 * @vni:       value stored verbatim in the header's vx_vni field
 * @md:        metadata passed on to vxlan_build_gbp_hdr()
 * @vxflags:   VXLAN_F_* device flags
 * @udp_sum:   whether an outer UDP checksum is requested
 *
 * Returns 0 on success or a negative errno. When skb_cow_head() fails the
 * skb is freed here before returning; the other error returns rely on the
 * failing helper (callers in this file treat the skb as gone after any
 * error).
 */
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
			   bool udp_sum)
{
	const u16 vxh_len = sizeof(struct vxlanhdr);
	int gso_type = SKB_GSO_UDP_TUNNEL;
	struct vxlanhdr *hdr;
	int headroom;
	int ret;

	if (udp_sum)
		gso_type = SKB_GSO_UDP_TUNNEL_CSUM;

	/*
	 * Remote checksum offload is only used when the checksum start fits
	 * the encodable range/alignment and the checksum field sits at the
	 * UDP or TCP offset. It replaces the outer UDP checksum.
	 */
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		const int start = skb_checksum_start_offset(skb);
		const bool start_ok = start <= VXLAN_MAX_REMCSUM_START &&
				      !(start & VXLAN_RCO_SHIFT_MASK);
		const bool l4_ok =
			skb->csum_offset == offsetof(struct udphdr, check) ||
			skb->csum_offset == offsetof(struct tcphdr, check);

		if (start_ok && l4_ok) {
			udp_sum = false;
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	}

	headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len +
		   VXLAN_HLEN + iphdr_len;
	if (skb_vlan_tag_present(skb))
		headroom += VLAN_HLEN;

	/* Make room for the new headers; pointers into the old head go stale. */
	ret = skb_cow_head(skb, headroom);
	if (unlikely(ret)) {
		kfree_skb(skb);
		return ret;
	}

	skb = vlan_hwaccel_push_inside(skb);
	if (WARN_ON(!skb))
		return -ENOMEM;

	skb = iptunnel_handle_offloads(skb, udp_sum, gso_type);
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	hdr = (struct vxlanhdr *)__skb_push(skb, sizeof(*hdr));
	hdr->vx_flags = htonl(VXLAN_HF_VNI);
	hdr->vx_vni = vni;

	if (gso_type & SKB_GSO_TUNNEL_REMCSUM) {
		u32 rco = (skb_checksum_start_offset(skb) - vxh_len) >>
			  VXLAN_RCO_SHIFT;

		if (skb->csum_offset == offsetof(struct udphdr, check))
			rco |= VXLAN_RCO_UDP;

		hdr->vx_vni |= htonl(rco);
		hdr->vx_flags |= htonl(VXLAN_HF_RCO);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(hdr, vxflags, md);

	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
	return 0;
}

/*
 * Look up the IPv4 route for an encapsulated packet.
 *
 * @vxlan: device whose ->net namespace the lookup runs in
 * @skb:   packet being sent; only its mark is used
 * @oif:   output interface index for the flow
 * @tos:   TOS value, filtered through RT_TOS()
 * @daddr: destination address
 * @saddr: in: preferred source address; out: the source address left in
 *         the flow key by the lookup (written only on success)
 *
 * Returns whatever ip_route_output_key() returned, i.e. either a route or
 * an ERR_PTR()-encoded error.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr)
{
	struct flowi4 key;
	struct rtable *route;

	memset(&key, 0, sizeof(key));
	key.daddr = daddr;
	key.saddr = *saddr;
	key.flowi4_proto = IPPROTO_UDP;
	key.flowi4_mark = skb->mark;
	key.flowi4_tos = RT_TOS(tos);
	key.flowi4_oif = oif;

	route = ip_route_output_key(vxlan->net, &key);
	if (IS_ERR(route))
		return route;

	*saddr = key.saddr;
	return route;
}

#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *kgrp_vxlan6_get_route(struct vxlan_dev *vxlan,
					  struct sk_buff *skb, int oif,
					  const struct in6_addr *daddr,
					  struct in6_addr *saddr)
{
	struct dst_entry *ndst;
	struct flowi6 fl6;
	/*
	 * Fix CVE-2020-1749
	 *  -1 line
	 */

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.daddr = *daddr;
	fl6.saddr = *saddr;
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = IPPROTO_UDP;

	/*
	 * Fix CVE-2020-1749
	 *  -5 lines, +4 lines
	 */
	ndst = kgrp_ip6_dst_lookup_flow(vxlan->net, vxlan->vn6_sock->sock->sk,
					&fl6, NULL);
	if (unlikely(IS_ERR(ndst)))
		return ndst;

	*saddr = fl6.saddr;
	return ndst;
}
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif

static void kgrr_vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *dev = skb->dev;
	int len = skb->len;

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family =  AF_INET6;
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif
	}

	if (dst_vxlan->flags & VXLAN_F_LEARN)
		(*kgre_vxlan_snoop)(skb->dev, &loopback, eth_hdr(skb)->h_source);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
	} else {
		dev->stats.rx_dropped++;
	}
}

/*
 * kgrp_vxlan_xmit_one - encapsulate and transmit one packet to one remote.
 *
 * @skb:     packet to send; this function takes care of it on every path
 *           (handed to the tunnel xmit helper, to the local bypass, or
 *           freed at the labels at the bottom)
 * @dev:     VXLAN net device the packet is sent on
 * @rdst:    remote destination to use, or NULL to take the destination from
 *           the skb's tunnel info
 * @did_rsc: when the destination is the "any" address, selects local
 *           short-circuit delivery instead of dropping
 *
 * Presumably the live-patched counterpart of vxlan_xmit_one() in
 * drivers/net/vxlan.c; the IPv6 route lookup goes through
 * kgrp_vxlan6_get_route(), which carries the CVE-2020-1749 fix.
 *
 * Outline:
 *  - choose destination, port and VNI from @rdst or from the tunnel info;
 *  - derive TTL, TOS and UDP source port, then let the tunnel info (if any)
 *    override TTL/TOS/checksum settings and supply the metadata;
 *  - per address family: look up the route, reject routes that point back
 *    at @dev, bypass encapsulation for local destinations, otherwise build
 *    the VXLAN header and hand the packet to the UDP tunnel xmit helper.
 */
void kgrp_vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
			   struct vxlan_rdst *rdst, bool did_rsc)
{
	struct ip_tunnel_info *info;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct sock *sk;
	struct rtable *rt = NULL;
	const struct iphdr *old_iph;
	union vxlan_addr *dst;
	union vxlan_addr remote_ip, local_ip;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	__be16 src_port = 0, dst_port;
	u32 vni;
	__be16 df = 0;
	__u8 tos, ttl;
	int err;
	u32 flags = vxlan->flags;
	bool udp_sum = false;
	/* true when the device and its underlay live in different namespaces */
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));

	info = skb_tunnel_info(skb);

	if (rdst) {
		/* explicit remote: fall back to the configured port if unset */
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
		vni = rdst->remote_vni;
		dst = &rdst->remote_ip;
		local_ip = vxlan->cfg.saddr;
	} else {
		/* no explicit remote: everything must come from tunnel info */
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		vni = be64_to_cpu(info->key.tun_id);
		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
		if (remote_ip.sa.sa_family == AF_INET) {
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
			local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
		} else {
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
			local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
		}
		dst = &remote_ip;
	}

	if (vxlan_addr_any(dst)) {
		if (did_rsc) {
			/* short-circuited back to local bridge */
			kgrr_vxlan_encap_bypass(skb, vxlan, vxlan);
			return;
		}
		goto drop;
	}

	old_iph = ip_hdr(skb);

	/* an unset TTL becomes 1 for multicast destinations */
	ttl = vxlan->cfg.ttl;
	if (!ttl && vxlan_addr_multicast(dst))
		ttl = 1;

	/* a configured TOS of 1 is replaced by the inner header's dsfield */
	tos = vxlan->cfg.tos;
	if (tos == 1)
		tos = ip_tunnel_get_dsfield(old_iph, skb);

	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);

	if (info) {
		/* tunnel info overrides the device-derived TTL/TOS/checksum */
		ttl = info->key.ttl;
		tos = info->key.tos;
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);

		if (info->options_len)
			md = ip_tunnel_info_opts(info);
	} else {
		/* without tunnel info the skb mark is used as the GBP value */
		md->gbp = skb->mark;
	}

	if (dst->sa.sa_family == AF_INET) {
		if (!vxlan->vn4_sock)
			goto drop;
		sk = vxlan->vn4_sock->sock->sk;

		if (info) {
			if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
				df = htons(IP_DF);
		} else {
			udp_sum = !!(flags & VXLAN_F_UDP_CSUM);
		}

		rt = vxlan_get_route(vxlan, skb,
				     rdst ? rdst->remote_ifindex : 0, tos,
				     dst->sin.sin_addr.s_addr,
				     &local_ip.sin.sin_addr.s_addr);
		if (IS_ERR(rt)) {
			netdev_dbg(dev, "no route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		/* a route leading back to this device would loop */
		if (rt->dst.dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.collisions++;
			goto rt_tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		if (!info && rt->rt_flags & RTCF_LOCAL &&
		    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			/* route reference dropped here, so only tx_error below */
			ip_rt_put(rt);
			dst_vxlan = (*kgre_vxlan_find_vni)(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			kgrr_vxlan_encap_bypass(skb, vxlan, dst_vxlan);
			return;
		}

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
		err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr),
				      htonl(vni << 8), md, flags, udp_sum);
		if (err < 0)
			goto xmit_tx_error;

		err = (*kgre_udp_tunnel_xmit_skb)(rt, sk, skb,
					  local_ip.sin.sin_addr.s_addr,
					  dst->sin.sin_addr.s_addr, tos, ttl,
					  df, src_port, dst_port, xnet,
					  !udp_sum);

		iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct dst_entry *ndst;
		u32 rt6i_flags;

		if (!vxlan->vn6_sock)
			goto drop;
		sk = vxlan->vn6_sock->sock->sk;

		/* route lookup carrying the CVE-2020-1749 fix */
		ndst = kgrp_vxlan6_get_route(vxlan, skb,
					rdst ? rdst->remote_ifindex : 0,
					&dst->sin6.sin6_addr,
					&local_ip.sin6.sin6_addr);
		if (IS_ERR(ndst)) {
			netdev_dbg(dev, "no route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		/* a route leading back to this device would loop */
		if (ndst->dev == dev) {
			netdev_dbg(dev, "circular route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dst_release(ndst);
			dev->stats.collisions++;
			goto tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
		if (!info && rt6i_flags & RTF_LOCAL &&
		    !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			dst_release(ndst);
			dst_vxlan = (*kgre_vxlan_find_vni)(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			kgrr_vxlan_encap_bypass(skb, vxlan, dst_vxlan);
			return;
		}

		if (!info)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);

		ttl = ttl ? : ip6_dst_hoplimit(ndst);
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
				      htonl(vni << 8), md, flags, udp_sum);
		if (err < 0) {
			/*
			 * NOTE(review): unlike the IPv4 branch, this path
			 * returns without incrementing tx_errors - confirm
			 * whether that asymmetry is intended.
			 */
			dst_release(ndst);
			return;
		}
		(*kgre_udp_tunnel6_xmit_skb)(ndst, sk, skb, dev,
				     &local_ip.sin6.sin6_addr, &dst->sin6.sin6_addr,
				     0, ttl, src_port, dst_port, !udp_sum);
#else
#error "klp-ccp: a preceeding branch should have been taken"
#endif
	}

	return;

	/*
	 * Error exits. The labels fall through into each other:
	 *   drop          - count tx_dropped, then free the skb
	 *   xmit_tx_error - skb already consumed; release route, count error
	 *   rt_tx_error   - release the IPv4 route, count error, free the skb
	 *   tx_error      - count error, free the skb
	 */
drop:
	dev->stats.tx_dropped++;
	goto tx_free;

xmit_tx_error:
	/* skb is already freed. */
	skb = NULL;
rt_tx_error:
	ip_rt_put(rt);
tx_error:
	dev->stats.tx_errors++;
tx_free:
	dev_kfree_skb(skb);
}

/*
 * Fill in the UDP ports (and, via the route lookup, the source address) of
 * the tunnel info attached to @skb.
 *
 * @dev: VXLAN net device
 * @skb: packet whose tunnel info is completed; the tunnel info is
 *       dereferenced unconditionally, so the caller must guarantee it exists
 *
 * A route lookup is performed for the tunnel's address family purely to
 * validate reachability and let the lookup update the source address in the
 * tunnel key; the route itself is released again. The ports are written
 * only after the lookup succeeded.
 *
 * Returns 0 on success, -EINVAL when the device has no socket for the
 * address family, or the error reported by the route lookup.
 */
int kgrp_vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *tun = skb_tunnel_info(skb);
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be16 src_port, dst_port;

	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);

	/* the tunnel key's port wins; otherwise use the configured one */
	dst_port = tun->key.tp_dst;
	if (!dst_port)
		dst_port = vxlan->cfg.dst_port;

	if (ip_tunnel_info_af(tun) != AF_INET) {
#if IS_ENABLED(CONFIG_IPV6)
		struct dst_entry *route6;

		if (!vxlan->vn6_sock)
			return -EINVAL;

		route6 = kgrp_vxlan6_get_route(vxlan, skb, 0,
					       &tun->key.u.ipv6.dst,
					       &tun->key.u.ipv6.src);
		if (IS_ERR(route6))
			return PTR_ERR(route6);
		dst_release(route6);
#else /* !CONFIG_IPV6 */
#error "klp-ccp: non-taken branch"
#endif
	} else {
		struct rtable *route4;

		if (!vxlan->vn4_sock)
			return -EINVAL;

		route4 = vxlan_get_route(vxlan, skb, 0, tun->key.tos,
					 tun->key.u.ipv4.dst,
					 &tun->key.u.ipv4.src);
		if (IS_ERR(route4))
			return PTR_ERR(route4);
		ip_rt_put(route4);
	}

	tun->key.tp_src = src_port;
	tun->key.tp_dst = dst_port;
	return 0;
}



/*
 * Symbols this live patch calls but cannot link against directly. Each
 * entry pairs a symbol name with the address of the kgre_* function pointer
 * that should receive it; the trailing string is presumably the name of the
 * module providing the symbol (TODO confirm against
 * struct kgr_kallsyms_reloc in ../kallsyms_relocs.h). The table is passed
 * to __kgr_resolve_kallsyms_relocs() from the init function and from the
 * module notifier below.
 */
static struct kgr_kallsyms_reloc kgr_funcs[] = {
	{ "udp_tunnel_xmit_skb", (void *)&kgre_udp_tunnel_xmit_skb,
	  "udp_tunnel" },
	{ "udp_tunnel6_xmit_skb", (void *)&kgre_udp_tunnel6_xmit_skb,
	  "ip6_udp_tunnel" },
	{ "vxlan_find_vni", (void *)&kgre_vxlan_find_vni, "vxlan" },
	{ "vxlan_snoop", (void *)&kgre_vxlan_snoop, "vxlan" },
};

static int kgr_patch_bsc1165631_vxlan_module_notify(struct notifier_block *nb,
					      unsigned long action, void *data)
{
	struct module *mod = data;
	int ret;

	if (action != MODULE_STATE_COMING || strcmp(mod->name, KGR_PATCHED_MODULE))
		return 0;

	ret = __kgr_resolve_kallsyms_relocs(kgr_funcs, ARRAY_SIZE(kgr_funcs));
	WARN(ret, "kgraft-patch: delayed kallsyms lookup failed. System is broken and can crash.\n");

	return ret;
}

/*
 * Notifier block hooking kgr_patch_bsc1165631_vxlan_module_notify() into
 * the module notifier chain; registered by the init function and removed by
 * the cleanup function. The priority is one above the minimum, presumably
 * so that the callback runs after nearly all other module notifiers
 * (TODO confirm the intended ordering).
 */
static struct notifier_block kgr_patch_bsc1165631_vxlan_module_nb = {
	.notifier_call = kgr_patch_bsc1165631_vxlan_module_notify,
	.priority = INT_MIN+1,
};

/*
 * Set up the vxlan part of the live patch.
 *
 * With module_mutex held, the kgr_funcs[] symbols are resolved right away
 * if the patched module is already present, and the module notifier is
 * registered so that a later load of the module triggers the resolution.
 * The notifier is not registered when the immediate resolution fails.
 *
 * Returns 0 on success, otherwise the error from the symbol resolution or
 * from register_module_notifier().
 */
int kgr_patch_bsc1165631_vxlan_init(void)
{
	int ret = 0;

	mutex_lock(&module_mutex);

	if (find_module(KGR_PATCHED_MODULE))
		ret = __kgr_resolve_kallsyms_relocs(kgr_funcs,
						    ARRAY_SIZE(kgr_funcs));
	if (!ret)
		ret = register_module_notifier(&kgr_patch_bsc1165631_vxlan_module_nb);

	mutex_unlock(&module_mutex);
	return ret;
}

/*
 * Undo kgr_patch_bsc1165631_vxlan_init(): remove the module notifier. The
 * function pointers resolved through kgr_funcs[] are left as they are.
 */
void kgr_patch_bsc1165631_vxlan_cleanup(void)
{
	unregister_module_notifier(&kgr_patch_bsc1165631_vxlan_module_nb);
}
