From b6c3f69bf3e5eeafc6f775511ab903c9f5faa837 Mon Sep 17 00:00:00 2001 From: /gray Date: Sun, 31 Mar 2024 13:03:20 +0800 Subject: [PATCH] patch/optimize(bpf): improve wan tcp hijack datapath performance (#481) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sumire (菫) <151038614+sumire88@users.noreply.github.com> --- common/consts/ebpf.go | 1 + control/control_plane.go | 5 +- control/control_plane_core.go | 27 +++++++ control/kern/tproxy.c | 139 ++++++++++++++++++++++++++++++---- docs/en/README.md | 4 +- docs/zh/README.md | 4 +- 6 files changed, 160 insertions(+), 20 deletions(-) diff --git a/common/consts/ebpf.go b/common/consts/ebpf.go index 420632d..40e0bc6 100644 --- a/common/consts/ebpf.go +++ b/common/consts/ebpf.go @@ -153,6 +153,7 @@ var ( SkAssignFeatureVersion = internal.Version{5, 7, 0} ChecksumFeatureVersion = internal.Version{5, 8, 0} ProgTypeSkLookupFeatureVersion = internal.Version{5, 9, 0} + SockmapFeatureVersion = internal.Version{5, 10, 0} UserspaceBatchUpdateLpmTrieFeatureVersion = internal.Version{5, 13, 0} HelperBpfGetFuncIpVersionFeatureVersion = internal.Version{5, 15, 0} ) diff --git a/control/control_plane.go b/control/control_plane.go index 8f1fdf7..d447e69 100644 --- a/control/control_plane.go +++ b/control/control_plane.go @@ -106,7 +106,7 @@ func NewControlPlane( kernelVersion.String(), requirement.String()) } - if requirement := consts.CgSocketCookieFeatureVersion; len(global.WanInterface) > 0 && kernelVersion.Less(requirement) { + if requirement := consts.SockmapFeatureVersion; len(global.WanInterface) > 0 && kernelVersion.Less(requirement) { return nil, fmt.Errorf("your kernel version %v does not support bind to WAN; expect >=%v; remove wan_interface in config file and try again", kernelVersion.String(), requirement.String()) @@ -221,6 +221,9 @@ func NewControlPlane( if err = core.setupSkPidMonitor(); err != nil { log.WithError(err).Warnln("cgroup2 is not enabled; pname 
routing cannot be used") } + if err = core.setupLocalTcpFastRedirect(); err != nil { + log.WithError(err).Warnln("failed to setup local tcp fast redirect") + } for _, ifname := range global.WanInterface { if err = core.bindWan(ifname, global.AutoConfigKernelParameter); err != nil { return nil, fmt.Errorf("bindWan: %v: %w", ifname, err) diff --git a/control/control_plane_core.go b/control/control_plane_core.go index 0c2176f..f73edfa 100644 --- a/control/control_plane_core.go +++ b/control/control_plane_core.go @@ -15,6 +15,7 @@ import ( "sync" "github.com/cilium/ebpf" + "github.com/cilium/ebpf/link" ciliumLink "github.com/cilium/ebpf/link" "github.com/daeuniverse/dae/common" "github.com/daeuniverse/dae/common/consts" @@ -382,6 +383,32 @@ func (c *controlPlaneCore) setupSkPidMonitor() error { return nil } +func (c *controlPlaneCore) setupLocalTcpFastRedirect() (err error) { + cgroupPath, err := detectCgroupPath() + if err != nil { + return + } + cg, err := link.AttachCgroup(link.CgroupOptions{ + Path: cgroupPath, + Program: c.bpf.LocalTcpSockops, // todo@gray: rename + Attach: ebpf.AttachCGroupSockOps, + }) + if err != nil { + return fmt.Errorf("AttachCgroupSockOps: %w", err) + } + c.deferFuncs = append(c.deferFuncs, cg.Close) + + if err = link.RawAttachProgram(link.RawAttachProgramOptions{ + Target: c.bpf.FastSock.FD(), + Program: c.bpf.SkMsgFastRedirect, + Attach: ebpf.AttachSkMsgVerdict, + }); err != nil { + return fmt.Errorf("AttachSkMsgVerdict: %w", err) + } + return nil + +} + func (c *controlPlaneCore) bindWan(ifname string, autoConfigKernelParameter bool) error { return c._bindWan(ifname) } diff --git a/control/kern/tproxy.c b/control/kern/tproxy.c index 93afd68..1b68e52 100644 --- a/control/kern/tproxy.c +++ b/control/kern/tproxy.c @@ -193,6 +193,17 @@ struct { __uint(pinning, LIBBPF_PIN_BY_NAME); } routing_tuples_map SEC(".maps"); +/* Sockets in fast_sock map are used for fast-redirecting via + * sk_msg/fast_redirect. 
Sockets are automatically deleted from map once + * closed, so we don't need to worry about stale entries. + */ +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __type(key, struct tuples_key); + __type(value, __u64); + __uint(max_entries, 65535); +} fast_sock SEC(".maps"); + // Link to type: #define LinkType_None 0 #define LinkType_Ethernet 1 @@ -1158,22 +1169,7 @@ int tproxy_wan_egress(struct __sk_buff *skb) get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto); - // We should know if this packet is from tproxy. - // We do not need to check the source ip because we have skipped packets not - // from localhost. - __be16 tproxy_port = PARAM.tproxy_port; - - if (!tproxy_port) - return TC_ACT_OK; - bool tproxy_response = tproxy_port == tuples.five.sport; - - if (tproxy_response) { - // WAN response won't reach here, must be a LAN response. - return TC_ACT_PIPE; - } - // Normal packets. - if (l4proto == IPPROTO_TCP) { // Backup for further use. tcp_state_syn = tcph.syn && !tcph.ack; @@ -1611,4 +1607,117 @@ int tproxy_wan_cg_sendmsg6(struct bpf_sock_addr *ctx) return 1; } +SEC("sockops") +int local_tcp_sockops(struct bpf_sock_ops *skops) +{ + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + __u32 pid = BPF_CORE_READ(task, pid); + + /* Only local TCP connections have non-zero pids. 
*/ + if (pid == 0) + return 0; + + struct tuples_key tuple = {}; + + tuple.l4proto = IPPROTO_TCP; + tuple.sport = bpf_htonl(skops->local_port) >> 16; + tuple.dport = skops->remote_port >> 16; + if (skops->family == AF_INET) { + tuple.sip.u6_addr32[2] = bpf_htonl(0x0000ffff); + tuple.sip.u6_addr32[3] = skops->local_ip4; + tuple.dip.u6_addr32[2] = bpf_htonl(0x0000ffff); + tuple.dip.u6_addr32[3] = skops->remote_ip4; + } else if (skops->family == AF_INET6) { + tuple.sip.u6_addr32[3] = skops->local_ip6[3]; + tuple.sip.u6_addr32[2] = skops->local_ip6[2]; + tuple.sip.u6_addr32[1] = skops->local_ip6[1]; + tuple.sip.u6_addr32[0] = skops->local_ip6[0]; + tuple.dip.u6_addr32[3] = skops->remote_ip6[3]; + tuple.dip.u6_addr32[2] = skops->remote_ip6[2]; + tuple.dip.u6_addr32[1] = skops->remote_ip6[1]; + tuple.dip.u6_addr32[0] = skops->remote_ip6[0]; + } else { + return 0; + } + + switch (skops->op) { + case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: // dae sockets + { + struct tuples_key rev_tuple = {}; + + rev_tuple.l4proto = IPPROTO_TCP; + rev_tuple.sport = tuple.dport; + rev_tuple.dport = tuple.sport; + __builtin_memcpy(&rev_tuple.sip, &tuple.dip, IPV6_BYTE_LENGTH); + __builtin_memcpy(&rev_tuple.dip, &tuple.sip, IPV6_BYTE_LENGTH); + + struct routing_result *routing_result; + + routing_result = bpf_map_lookup_elem(&routing_tuples_map, &rev_tuple); + if (!routing_result || !routing_result->pid) + break; + + if (!bpf_sock_hash_update(skops, &fast_sock, &tuple, BPF_ANY)) + bpf_printk("fast_sock added: %pI4:%lu -> %pI4:%lu", + &tuple.sip.u6_addr32[3], bpf_ntohs(tuple.sport), + &tuple.dip.u6_addr32[3], bpf_ntohs(tuple.dport)); + break; + } + + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: // local client sockets + { + struct routing_result *routing_result; + + routing_result = bpf_map_lookup_elem(&routing_tuples_map, &tuple); + if (!routing_result || !routing_result->pid) + break; + + if (!bpf_sock_hash_update(skops, &fast_sock, &tuple, BPF_ANY)) + bpf_printk("fast_sock added: %pI4:%lu -> 
%pI4:%lu", + &tuple.sip.u6_addr32[3], bpf_ntohs(tuple.sport), + &tuple.dip.u6_addr32[3], bpf_ntohs(tuple.dport)); + break; + } + + default: + break; + } + + return 0; +} + +SEC("sk_msg/fast_redirect") +int sk_msg_fast_redirect(struct sk_msg_md *msg) +{ + struct tuples_key rev_tuple = {}; + + rev_tuple.l4proto = IPPROTO_TCP; + rev_tuple.sport = msg->remote_port >> 16; + rev_tuple.dport = bpf_htonl(msg->local_port) >> 16; + if (msg->family == AF_INET) { + rev_tuple.sip.u6_addr32[2] = bpf_htonl(0x0000ffff); + rev_tuple.sip.u6_addr32[3] = msg->remote_ip4; + rev_tuple.dip.u6_addr32[2] = bpf_htonl(0x0000ffff); + rev_tuple.dip.u6_addr32[3] = msg->local_ip4; + } else if (msg->family == AF_INET6) { + rev_tuple.sip.u6_addr32[3] = msg->remote_ip6[3]; + rev_tuple.sip.u6_addr32[2] = msg->remote_ip6[2]; + rev_tuple.sip.u6_addr32[1] = msg->remote_ip6[1]; + rev_tuple.sip.u6_addr32[0] = msg->remote_ip6[0]; + rev_tuple.dip.u6_addr32[3] = msg->local_ip6[3]; + rev_tuple.dip.u6_addr32[2] = msg->local_ip6[2]; + rev_tuple.dip.u6_addr32[1] = msg->local_ip6[1]; + rev_tuple.dip.u6_addr32[0] = msg->local_ip6[0]; + } else { + return SK_PASS; + } + + if (bpf_msg_redirect_hash(msg, &fast_sock, &rev_tuple, BPF_F_INGRESS) == SK_PASS) + bpf_printk("tcp fast redirect: %pI4:%lu -> %pI4:%lu", + &rev_tuple.sip.u6_addr32[3], bpf_ntohs(rev_tuple.sport), + &rev_tuple.dip.u6_addr32[3], bpf_ntohs(rev_tuple.dport)); + + return SK_PASS; +} + SEC("license") const char __license[] = "Dual BSD/GPL"; diff --git a/docs/en/README.md b/docs/en/README.md index a59d9c2..0f220e6 100644 --- a/docs/en/README.md +++ b/docs/en/README.md @@ -19,11 +19,11 @@ This feature requires the kernel version of machine on which dae install >= 5.8. Note that if you bind dae to LAN only, dae only provide network service for traffic from LAN, and not impact local programs. -`Bind to WAN: >= 5.8` +`Bind to WAN: >= 5.10` You need bind dae to WAN interface, if you want dae to provide network service for local programs. 
-This feature requires kernel version of the machine >= 5.8. +This feature requires kernel version of the machine >= 5.10. Note that if you bind dae to WAN only, dae only provide network service for local programs and not impact traffic coming in from other interfaces. diff --git a/docs/zh/README.md b/docs/zh/README.md index 7fbc82f..1fe159f 100644 --- a/docs/zh/README.md +++ b/docs/zh/README.md @@ -17,11 +17,11 @@ 如果你只在 `lan_interface` 中填写了接口,而未在 `wan_interface` 中填写内容,那么本地程序将无法被代理。如果你期望代理本地程序,需要在 `wan_interface` 中填写 `auto` 或是手动输入 WAN 接口。 -`绑定到 WAN 接口: >= 5.8` +`绑定到 WAN 接口: >= 5.10` 如果你想为本地程序提供代理服务,需要把 dae 绑定到 WAN 接口上。 -该特性要求 dae 所在的设备的内核版本 >= 5.8。 +该特性要求 dae 所在的设备的内核版本 >= 5.10。 如果你只在 `wan_interface` 中填写了接口或 `auto`,而未在 `lan_interface` 中填写内容,那么从局域网中传来的流量将无法被代理。如果你想同时代理本机和局域网流量,请同时填写 `wan_interface` 和 `lan_interface`。