From e60295dac7d4a3e54d390ac2161bf487ee758e39 Mon Sep 17 00:00:00 2001 From: mzz2017 <2017@duck.com> Date: Tue, 31 Jan 2023 18:08:38 +0800 Subject: [PATCH] fix: problem that node address cannot be domain --- .gitignore | 5 +- Makefile | 2 + README.md | 1 - common/consts/ebpf.go | 1 + component/control/control.go | 3 +- component/control/control_plane.go | 97 +++- .../control/kern/headers/if_ether_defs.h | 137 +++++ component/control/kern/headers/pkt_cls_defs.h | 105 ++++ component/control/kern/headers/socket_defs.h | 196 +++++++ component/control/kern/tproxy.c | 522 +++++++++++++----- component/control/tcp.go | 12 +- insert.sh | 2 +- 12 files changed, 921 insertions(+), 162 deletions(-) create mode 100644 component/control/kern/headers/if_ether_defs.h create mode 100644 component/control/kern/headers/pkt_cls_defs.h create mode 100644 component/control/kern/headers/socket_defs.h diff --git a/.gitignore b/.gitignore index 3a83dd6..2a896f5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ .idea *.o *.tmp -bpf_bpfeb.go -bpf_bpfel.go +bpf_bpfeb*.go +bpf_bpfel*.go dae +vmlinux.h \ No newline at end of file diff --git a/Makefile b/Makefile index d72f4fd..a1d1534 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ CLANG ?= clang STRIP ?= llvm-strip OUTPUT ?= dae CFLAGS := -O2 -Wall -Werror $(CFLAGS) +GOARCH ?= amd64 # Get version from .git. date=$(shell git log -1 --format="%cd" --date=short | sed s/-//g) @@ -30,6 +31,7 @@ dae: ebpf ebpf: export BPF_CLANG := $(CLANG) ebpf: export BPF_STRIP := $(STRIP) ebpf: export BPF_CFLAGS := $(CFLAGS) +ebpf: export BPF_GOARCH := $(GOARCH) ebpf: unset GOOS && \ unset GOARCH && \ diff --git a/README.md b/README.md index 1510693..8bb85dd 100644 --- a/README.md +++ b/README.md @@ -32,5 +32,4 @@ See [example.dae](https://github.com/v2rayA/dae/blob/main/example.dae). 1. Handle the case that nodes do not support UDP. 1. L4Checksum problem. 1. Config support list like: `wan_interface: [wlp5s0, eth0]`. -1. 
Fix problem that node address cannot be domain. 1. ... diff --git a/common/consts/ebpf.go b/common/consts/ebpf.go index 395fd18..98d4478 100644 --- a/common/consts/ebpf.go +++ b/common/consts/ebpf.go @@ -19,6 +19,7 @@ const ( BigEndianTproxyPortKey DisableL4TxChecksumKey DisableL4RxChecksumKey + ControlPlaneOidKey ) type DisableL4ChecksumPolicy uint32 diff --git a/component/control/control.go b/component/control/control.go index 6530581..57e7ece 100644 --- a/component/control/control.go +++ b/component/control/control.go @@ -6,4 +6,5 @@ package control // $BPF_CLANG and $BPF_CFLAGS are set by the Makefile. -//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc $BPF_CLANG -strip $BPF_STRIP -cflags $BPF_CFLAGS bpf kern/tproxy.c -- +//go:generate sh -c "bpftool btf dump file /sys/kernel/btf/vmlinux format c > kern/headers/vmlinux.h" +//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc $BPF_CLANG -strip $BPF_STRIP -cflags $BPF_CFLAGS -target $BPF_GOARCH bpf kern/tproxy.c -- -I./headers diff --git a/component/control/control_plane.go b/component/control/control_plane.go index 51c9b3c..b8495eb 100644 --- a/component/control/control_plane.go +++ b/component/control/control_plane.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "github.com/cilium/ebpf" + ciliumLink "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/rlimit" "github.com/mzz2017/softwind/pool" "github.com/sirupsen/logrus" @@ -116,6 +117,11 @@ retryLoadBpf: if err = bpf.ParamMap.Update(consts.DisableL4RxChecksumKey, consts.DisableL4ChecksumPolicy_SetZero, ebpf.UpdateAny); err != nil { return nil, err } + // Write tproxy (control plane) PID. + if err = bpf.ParamMap.Update(consts.ControlPlaneOidKey, uint32(os.Getpid()), ebpf.UpdateAny); err != nil { + return nil, err + } + // Write ip_proto to hdr_size map for IPv6 extension extraction. 
if err = bpf.IpprotoHdrsizeMap.Update(uint32(unix.IPPROTO_HOPOPTS), int32(-1), ebpf.UpdateAny); err != nil { return nil, err } @@ -368,14 +374,89 @@ func (c *ControlPlane) BindWan(ifname string) error { return err } - //// Insert SrcPidMapper. - //sock, err := internal.OpenRawSock(link.Attrs().Index) - //if err != nil { - // return fmt.Errorf("failed to open raw sock: %v: %w", ifname, err) - //} - //if err = unix.SetsockoptInt(sock, unix.SOL_SOCKET, unix.SO_ATTACH_BPF, c.bpf.bpfPrograms.SrcPidMapper.FD()); err != nil { - // return fmt.Errorf("failed to attach SrcPidMapper") - //} + version, e := internal.KernelVersion() + if e != nil { + return fmt.Errorf("BindWan: failed to get kernel version: %w", e) + } + ftraceFeatureVersion := internal.Version{5, 5, 0} + if version.Less(ftraceFeatureVersion) { + // Not support ftrace (fentry/fexit). + // PID bypass needs it. + return fmt.Errorf("your kernel version %v does not support bind to WAN; expect >=%v", version.String(), ftraceFeatureVersion.String()) + } + + // Set-up SrcPidMapper. + // Attach programs to support pname routing. 
+ + // ipv4 tcp/udp: send + inetSendPrepare, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{ + Program: c.bpf.InetSendPrepare, + }) + if err != nil { + return fmt.Errorf("AttachTracing InetSendPrepare: %w", err) + } + c.deferFuncs = append(c.deferFuncs, func() error { + if err := inetSendPrepare.Close(); err != nil { + return fmt.Errorf("inetSendPrepare.Close(): %w", err) + } + return nil + }) + + // ipv4 tcp/udp: listen + inetBind, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{ + Program: c.bpf.InetBind, + }) + if err != nil { + return fmt.Errorf("AttachTracing InetBind: %w", err) + } + c.deferFuncs = append(c.deferFuncs, func() error { + if err := inetBind.Close(); err != nil { + return fmt.Errorf("inetBind.Close(): %w", err) + } + return nil + }) + + // ipv4 udp: sendto/sendmsg + inetAutoBind, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{ + Program: c.bpf.InetAutobind, + }) + if err != nil { + return fmt.Errorf("AttachTracing InetAutobind: %w", err) + } + c.deferFuncs = append(c.deferFuncs, func() error { + if err := inetAutoBind.Close(); err != nil { + return fmt.Errorf("inetAutoBind.Close(): %w", err) + } + return nil + }) + + // ipv4 tcp: connect + tcpConnect, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{ + Program: c.bpf.TcpConnect, + }) + if err != nil { + return fmt.Errorf("AttachTracing TcpConnect: %w", err) + } + c.deferFuncs = append(c.deferFuncs, func() error { + if err := tcpConnect.Close(); err != nil { + return fmt.Errorf("tcpConnect.Close(): %w", err) + } + return nil + }) + + // ipv6 tcp/udp: listen + inet6Bind, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{ + Program: c.bpf.Inet6Bind, + }) + if err != nil { + return fmt.Errorf("AttachTracing Inet6Bind: %w", err) + } + c.deferFuncs = append(c.deferFuncs, func() error { + if err := inet6Bind.Close(); err != nil { + return fmt.Errorf("inet6Bind.Close(): %w", err) + } + return nil + }) // Insert qdisc and tc filters. 
qdisc := &netlink.GenericQdisc{ diff --git a/component/control/kern/headers/if_ether_defs.h b/component/control/kern/headers/if_ether_defs.h new file mode 100644 index 0000000..aaf4c95 --- /dev/null +++ b/component/control/kern/headers/if_ether_defs.h @@ -0,0 +1,137 @@ +#ifndef __IF_ETHER_DEFS_H__ +#define __IF_ETHER_DEFS_H__ +/* + * IEEE 802.3 Ethernet magic constants. The frame sizes omit the preamble + * and FCS/CRC (frame check sequence). + */ + +#define ETH_ALEN 6 /* Octets in one ethernet addr */ +#define ETH_TLEN 2 /* Octets in ethernet type field */ +#define ETH_HLEN 14 /* Total octets in header. */ +#define ETH_ZLEN 60 /* Min. octets in frame sans FCS */ +#define ETH_DATA_LEN 1500 /* Max. octets in payload */ +#define ETH_FRAME_LEN 1514 /* Max. octets in frame sans FCS */ +#define ETH_FCS_LEN 4 /* Octets in the FCS */ + +#define ETH_MIN_MTU 68 /* Min IPv4 MTU per RFC791 */ +#define ETH_MAX_MTU 0xFFFFU /* 65535, same as IP_MAX_MTU */ + +/* + * These are the defined Ethernet Protocol ID's. 
+ */ + +#define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */ +#define ETH_P_PUP 0x0200 /* Xerox PUP packet */ +#define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */ +#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */ +#define ETH_P_ERSPAN2 0x22EB /* ERSPAN version 2 (type III) */ +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ +#define ETH_P_X25 0x0805 /* CCITT X.25 */ +#define ETH_P_ARP 0x0806 /* Address Resolution packet */ +#define ETH_P_BPQ 0x08FF /* G8BPQ AX.25 Ethernet Packet [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IEEEPUP 0x0a00 /* Xerox IEEE802.3 PUP packet */ +#define ETH_P_IEEEPUPAT 0x0a01 /* Xerox IEEE802.3 PUP Addr Trans packet */ +#define ETH_P_BATMAN 0x4305 /* B.A.T.M.A.N.-Advanced packet [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DEC 0x6000 /* DEC Assigned proto */ +#define ETH_P_DNA_DL 0x6001 /* DEC DNA Dump/Load */ +#define ETH_P_DNA_RC 0x6002 /* DEC DNA Remote Console */ +#define ETH_P_DNA_RT 0x6003 /* DEC DNA Routing */ +#define ETH_P_LAT 0x6004 /* DEC LAT */ +#define ETH_P_DIAG 0x6005 /* DEC Diagnostics */ +#define ETH_P_CUST 0x6006 /* DEC Customer use */ +#define ETH_P_SCA 0x6007 /* DEC Systems Comms Arch */ +#define ETH_P_TEB 0x6558 /* Trans Ether Bridging */ +#define ETH_P_RARP 0x8035 /* Reverse Addr Res packet */ +#define ETH_P_ATALK 0x809B /* Appletalk DDP */ +#define ETH_P_AARP 0x80F3 /* Appletalk AARP */ +#define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */ +#define ETH_P_ERSPAN 0x88BE /* ERSPAN type II */ +#define ETH_P_IPX 0x8137 /* IPX over DIX */ +#define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ +#define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. See 802.3 31B */ +#define ETH_P_SLOW 0x8809 /* Slow Protocol. 
See 802.3ad 43B */ +#define ETH_P_WCCP 0x883E /* Web-cache coordination protocol + * defined in draft-wilson-wrec-wccp-v2-00.txt */ +#define ETH_P_MPLS_UC 0x8847 /* MPLS Unicast traffic */ +#define ETH_P_MPLS_MC 0x8848 /* MPLS Multicast traffic */ +#define ETH_P_ATMMPOA 0x884c /* MultiProtocol Over ATM */ +#define ETH_P_PPP_DISC 0x8863 /* PPPoE discovery messages */ +#define ETH_P_PPP_SES 0x8864 /* PPPoE session messages */ +#define ETH_P_LINK_CTL 0x886c /* HPNA, wlan link local tunnel */ +#define ETH_P_ATMFATE 0x8884 /* Frame-based ATM Transport + * over Ethernet + */ +#define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ +#define ETH_P_PROFINET 0x8892 /* PROFINET */ +#define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */ +#define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ +#define ETH_P_ETHERCAT 0x88A4 /* EtherCAT */ +#define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ +#define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ +#define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ +#define ETH_P_TIPC 0x88CA /* TIPC */ +#define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ +#define ETH_P_MRP 0x88E3 /* Media Redundancy Protocol */ +#define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ +#define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ +#define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ +#define ETH_P_1588 0x88F7 /* IEEE 1588 Timesync */ +#define ETH_P_NCSI 0x88F8 /* NCSI protocol */ +#define ETH_P_PRP 0x88FB /* IEC 62439-3 PRP/HSRv0 */ +#define ETH_P_CFM 0x8902 /* Connectivity Fault Management */ +#define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */ +#define ETH_P_IBOE 0x8915 /* Infiniband over Ethernet */ +#define ETH_P_TDLS 0x890D /* TDLS */ +#define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ +#define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */ +#define ETH_P_HSR 0x892F /* IEC 62439-3 HSRv1 */ +#define ETH_P_NSH 0x894F /* Network Service Header */ +#define ETH_P_LOOPBACK 0x9000 
/* Ethernet loopback packet, per IEEE 802.3 */ +#define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ +#define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ + +#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is more than this value + * then the frame is Ethernet II. Else it is 802.3 */ + +/* + * Non DIX types. Won't clash for 1500 types. + */ + +#define ETH_P_802_3 0x0001 /* Dummy type for 802.3 frames */ +#define ETH_P_AX25 0x0002 /* Dummy protocol id for AX.25 */ +#define ETH_P_ALL 0x0003 /* Every packet (be careful!!!) */ +#define ETH_P_802_2 0x0004 /* 802.2 frames */ +#define ETH_P_SNAP 0x0005 /* Internal only */ +#define ETH_P_DDCMP 0x0006 /* DEC DDCMP: Internal only */ +#define ETH_P_WAN_PPP 0x0007 /* Dummy type for WAN PPP frames*/ +#define ETH_P_PPP_MP 0x0008 /* Dummy type for PPP MP frames */ +#define ETH_P_LOCALTALK 0x0009 /* Localtalk pseudo type */ +#define ETH_P_CAN 0x000C /* CAN: Controller Area Network */ +#define ETH_P_CANFD 0x000D /* CANFD: CAN flexible data rate*/ +#define ETH_P_PPPTALK 0x0010 /* Dummy type for Atalk over PPP*/ +#define ETH_P_TR_802_2 0x0011 /* 802.2 frames */ +#define ETH_P_MOBITEX 0x0015 /* Mobitex (kaz@cafe.net) */ +#define ETH_P_CONTROL 0x0016 /* Card specific control frames */ +#define ETH_P_IRDA 0x0017 /* Linux-IrDA */ +#define ETH_P_ECONET 0x0018 /* Acorn Econet */ +#define ETH_P_HDLC 0x0019 /* HDLC frames */ +#define ETH_P_ARCNET 0x001A /* 1A for ArcNet :-) */ +#define ETH_P_DSA 0x001B /* Distributed Switch Arch. 
*/ +#define ETH_P_TRAILER 0x001C /* Trailer switch tagging */ +#define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ +#define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ +#define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ +#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ +#define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and + * aggregation protocol + */ +#define ETH_P_MCTP 0x00FA /* Management component transport + * protocol packets + */ + +#endif \ No newline at end of file diff --git a/component/control/kern/headers/pkt_cls_defs.h b/component/control/kern/headers/pkt_cls_defs.h new file mode 100644 index 0000000..9afb1ee --- /dev/null +++ b/component/control/kern/headers/pkt_cls_defs.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __PKT_CLS_DEFS_H__ +#define __PKT_CLS_DEFS_H__ + +#define TC_COOKIE_MAX_SIZE 16 + +/* Action attributes */ + +/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */ +#define TCA_ACT_FLAGS_NO_PERCPU_STATS \ + (1 << 0) /* Don't use percpu allocator for \ + * actions stats. \ + */ +#define TCA_ACT_FLAGS_SKIP_HW (1 << 1) /* don't offload action to HW */ +#define TCA_ACT_FLAGS_SKIP_SW (1 << 2) /* don't use action in SW */ + +/* tca HW stats type + * When user does not pass the attribute, he does not care. + * It is the same as if he would pass the attribute with + * all supported bits set. + * In case no bits are set, user is not interested in getting any HW statistics. + */ +#define TCA_ACT_HW_STATS_IMMEDIATE \ + (1 << 0) /* Means that in dump, user \ + * gets the current HW stats \ + * state from the device \ + * queried at the dump time. \ + */ +#define TCA_ACT_HW_STATS_DELAYED \ + (1 << 1) /* Means that in dump, user gets \ + * HW stats that might be out of date \ + * for some time, maybe couple of \ + * seconds. This is the case when \ + * driver polls stats updates \ + * periodically or when it gets async \ + * stats update from the device. 
\ + */ + +#define TCA_ACT_MAX __TCA_ACT_MAX +#define TCA_OLD_COMPAT (TCA_ACT_MAX + 1) +#define TCA_ACT_MAX_PRIO 32 +#define TCA_ACT_BIND 1 +#define TCA_ACT_NOBIND 0 +#define TCA_ACT_UNBIND 1 +#define TCA_ACT_NOUNBIND 0 +#define TCA_ACT_REPLACE 1 +#define TCA_ACT_NOREPLACE 0 + +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 +/* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. + */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN + +/* These macros are put here for binary compatibility with userspace apps that + * make use of them. For kernel code and new userspace apps, use the TCA_ID_* + * versions. 
+ */ +#define TCA_ACT_GACT 5 +#define TCA_ACT_IPT 6 +#define TCA_ACT_PEDIT 7 +#define TCA_ACT_MIRRED 8 +#define TCA_ACT_NAT 9 +#define TCA_ACT_XT 10 +#define TCA_ACT_SKBEDIT 11 +#define TCA_ACT_VLAN 12 +#define TCA_ACT_BPF 13 +#define TCA_ACT_CONNMARK 14 +#define TCA_ACT_SKBMOD 15 +#define TCA_ACT_CSUM 16 +#define TCA_ACT_TUNNEL_KEY 17 +#define TCA_ACT_SIMP 22 +#define TCA_ACT_IFE 25 +#define TCA_ACT_SAMPLE 26 + +#endif \ No newline at end of file diff --git a/component/control/kern/headers/socket_defs.h b/component/control/kern/headers/socket_defs.h new file mode 100644 index 0000000..149bd8b --- /dev/null +++ b/component/control/kern/headers/socket_defs.h @@ -0,0 +1,196 @@ +#ifndef __SOCKET_DEFS_H__ +#define __SOCKET_DEFS_H__ + +/* Supported address families. */ +#define AF_UNSPEC 0 +#define AF_UNIX 1 /* Unix domain sockets */ +#define AF_LOCAL 1 /* POSIX name for AF_UNIX */ +#define AF_INET 2 /* Internet IP Protocol */ +#define AF_AX25 3 /* Amateur Radio AX.25 */ +#define AF_IPX 4 /* Novell IPX */ +#define AF_APPLETALK 5 /* AppleTalk DDP */ +#define AF_NETROM 6 /* Amateur Radio NET/ROM */ +#define AF_BRIDGE 7 /* Multiprotocol bridge */ +#define AF_ATMPVC 8 /* ATM PVCs */ +#define AF_X25 9 /* Reserved for X.25 project */ +#define AF_INET6 10 /* IP version 6 */ +#define AF_ROSE 11 /* Amateur Radio X.25 PLP */ +#define AF_DECnet 12 /* Reserved for DECnet project */ +#define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/ +#define AF_SECURITY 14 /* Security callback pseudo AF */ +#define AF_KEY 15 /* PF_KEY key management API */ +#define AF_NETLINK 16 +#define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ +#define AF_PACKET 17 /* Packet family */ +#define AF_ASH 18 /* Ash */ +#define AF_ECONET 19 /* Acorn Econet */ +#define AF_ATMSVC 20 /* ATM SVCs */ +#define AF_RDS 21 /* RDS sockets */ +#define AF_SNA 22 /* Linux SNA Project (nutters!) 
*/ +#define AF_IRDA 23 /* IRDA sockets */ +#define AF_PPPOX 24 /* PPPoX sockets */ +#define AF_WANPIPE 25 /* Wanpipe API Sockets */ +#define AF_LLC 26 /* Linux LLC */ +#define AF_IB 27 /* Native InfiniBand address */ +#define AF_MPLS 28 /* MPLS */ +#define AF_CAN 29 /* Controller Area Network */ +#define AF_TIPC 30 /* TIPC sockets */ +#define AF_BLUETOOTH 31 /* Bluetooth sockets */ +#define AF_IUCV 32 /* IUCV sockets */ +#define AF_RXRPC 33 /* RxRPC sockets */ +#define AF_ISDN 34 /* mISDN sockets */ +#define AF_PHONET 35 /* Phonet sockets */ +#define AF_IEEE802154 36 /* IEEE802154 sockets */ +#define AF_CAIF 37 /* CAIF sockets */ +#define AF_ALG 38 /* Algorithm sockets */ +#define AF_NFC 39 /* NFC sockets */ +#define AF_VSOCK 40 /* vSockets */ +#define AF_KCM 41 /* Kernel Connection Multiplexor*/ +#define AF_QIPCRTR 42 /* Qualcomm IPC Router */ +#define AF_SMC 43 /* smc sockets: reserve number for + * PF_SMC protocol family that + * reuses AF_INET address family + */ +#define AF_XDP 44 /* XDP sockets */ + +#define AF_MAX 45 /* For now.. */ + +/* Protocol families, same as address families. 
*/ +#define PF_UNSPEC AF_UNSPEC +#define PF_UNIX AF_UNIX +#define PF_LOCAL AF_LOCAL +#define PF_INET AF_INET +#define PF_AX25 AF_AX25 +#define PF_IPX AF_IPX +#define PF_APPLETALK AF_APPLETALK +#define PF_NETROM AF_NETROM +#define PF_BRIDGE AF_BRIDGE +#define PF_ATMPVC AF_ATMPVC +#define PF_X25 AF_X25 +#define PF_INET6 AF_INET6 +#define PF_ROSE AF_ROSE +#define PF_DECnet AF_DECnet +#define PF_NETBEUI AF_NETBEUI +#define PF_SECURITY AF_SECURITY +#define PF_KEY AF_KEY +#define PF_NETLINK AF_NETLINK +#define PF_ROUTE AF_ROUTE +#define PF_PACKET AF_PACKET +#define PF_ASH AF_ASH +#define PF_ECONET AF_ECONET +#define PF_ATMSVC AF_ATMSVC +#define PF_RDS AF_RDS +#define PF_SNA AF_SNA +#define PF_IRDA AF_IRDA +#define PF_PPPOX AF_PPPOX +#define PF_WANPIPE AF_WANPIPE +#define PF_LLC AF_LLC +#define PF_IB AF_IB +#define PF_MPLS AF_MPLS +#define PF_CAN AF_CAN +#define PF_TIPC AF_TIPC +#define PF_BLUETOOTH AF_BLUETOOTH +#define PF_IUCV AF_IUCV +#define PF_RXRPC AF_RXRPC +#define PF_ISDN AF_ISDN +#define PF_PHONET AF_PHONET +#define PF_IEEE802154 AF_IEEE802154 +#define PF_CAIF AF_CAIF +#define PF_ALG AF_ALG +#define PF_NFC AF_NFC +#define PF_VSOCK AF_VSOCK +#define PF_KCM AF_KCM +#define PF_QIPCRTR AF_QIPCRTR +#define PF_SMC AF_SMC +#define PF_XDP AF_XDP +#define PF_MAX AF_MAX + +/* Maximum queue length specifiable by listen. */ +#define SOMAXCONN 4096 + +/* Flags we can use with send/ and recv. + Added those for 1003.1g not all are supported yet + */ + +#define MSG_OOB 1 +#define MSG_PEEK 2 +#define MSG_DONTROUTE 4 +#define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ +#define MSG_CTRUNC 8 +#define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. 
for MTU */ +#define MSG_TRUNC 0x20 +#define MSG_DONTWAIT 0x40 /* Nonblocking io */ +#define MSG_EOR 0x80 /* End of record */ +#define MSG_WAITALL 0x100 /* Wait for a full request */ +#define MSG_FIN 0x200 +#define MSG_SYN 0x400 +#define MSG_CONFIRM 0x800 /* Confirm path validity */ +#define MSG_RST 0x1000 +#define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ +#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ +#define MSG_MORE 0x8000 /* Sender will send more */ +#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ +#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ +#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ +#define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ +#define MSG_EOF MSG_FIN +#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ +#define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry + * plain text and require encryption + */ + +#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ +#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ +#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file + descriptor received through + SCM_RIGHTS */ +#if defined(CONFIG_COMPAT) +#define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */ +#else +#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ +#endif + + +/* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ +#define SOL_IP 0 +/* #define SOL_ICMP 1 No-no-no! 
Due to Linux :-) we cannot use SOL_ICMP=1 */ +#define SOL_TCP 6 +#define SOL_UDP 17 +#define SOL_IPV6 41 +#define SOL_ICMPV6 58 +#define SOL_SCTP 132 +#define SOL_UDPLITE 136 /* UDP-Lite (RFC 3828) */ +#define SOL_RAW 255 +#define SOL_IPX 256 +#define SOL_AX25 257 +#define SOL_ATALK 258 +#define SOL_NETROM 259 +#define SOL_ROSE 260 +#define SOL_DECNET 261 +#define SOL_X25 262 +#define SOL_PACKET 263 +#define SOL_ATM 264 /* ATM layer (cell level) */ +#define SOL_AAL 265 /* ATM Adaption Layer (packet level) */ +#define SOL_IRDA 266 +#define SOL_NETBEUI 267 +#define SOL_LLC 268 +#define SOL_DCCP 269 +#define SOL_NETLINK 270 +#define SOL_TIPC 271 +#define SOL_RXRPC 272 +#define SOL_PPPOL2TP 273 +#define SOL_BLUETOOTH 274 +#define SOL_PNPIPE 275 +#define SOL_RDS 276 +#define SOL_IUCV 277 +#define SOL_CAIF 278 +#define SOL_ALG 279 +#define SOL_NFC 280 +#define SOL_KCM 281 +#define SOL_TLS 282 +#define SOL_XDP 283 + +/* IPX options */ +#define IPX_TYPE 1 + +#endif \ No newline at end of file diff --git a/component/control/kern/tproxy.c b/component/control/kern/tproxy.c index 8953a6c..eb4d73b 100644 --- a/component/control/kern/tproxy.c +++ b/component/control/kern/tproxy.c @@ -3,20 +3,18 @@ * SPDX-License-Identifier: AGPL-3.0-only * Copyright (c) since 2022, v2rayA Organization */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "headers/if_ether_defs.h" +#include "headers/pkt_cls_defs.h" +#include "headers/socket_defs.h" +#include "headers/vmlinux.h" // Use "make ebpf" to generate. 
+#include +// #include + +#include #include #include +#include // #define likely(x) x // #define unlikely(x) x @@ -24,6 +22,7 @@ #define unlikely(x) __builtin_expect((x), 0) #define IPV6_BYTE_LENGTH 16 +#define TASK_COMM_LEN 16 #define IPV4_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) #define IPV4_DST_OFF (ETH_HLEN + offsetof(struct iphdr, daddr)) @@ -41,8 +40,7 @@ //#define MAX_LPM_SIZE 20480 #define MAX_LPM_NUM (MAX_MATCH_SET_LEN + 8) #define MAX_DST_MAPPING_NUM (65536 * 2) -#define MAX_SRC_PID_MAPPING_NUM \ - 65536 // It is enough because we use it just for routing. +#define MAX_SRC_PID_PNAME_MAPPING_NUM (65536) #define IPV6_MAX_EXTENSIONS 4 #define OUTBOUND_DIRECT 0 @@ -52,6 +50,11 @@ #define OUTBOUND_LOGICAL_AND 0xFF #define OUTBOUND_LOGICAL_MASK 0xFE +/* Current network namespace */ +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; + enum { DisableL4ChecksumPolicy_EnableL4Checksum, DisableL4ChecksumPolicy_Restore, @@ -63,6 +66,7 @@ static const __u32 zero_key = 0; static const __u32 tproxy_port_key = 1; static const __u32 disable_l4_tx_checksum_key = 2; static const __u32 disable_l4_rx_checksum_key = 3; +static const __u32 control_plane_pid_key = 4; struct ip_port { __be32 ip[4]; @@ -197,10 +201,10 @@ enum L4ProtoType { L4ProtoType_UDP = 2, L4ProtoType_X = 3, }; -enum IpVersion { - IpVersion_4 = 1, - IpVersion_6 = 2, - IpVersion_X = 3, +enum IpVersionType { + IpVersionType_4 = 1, + IpVersionType_6 = 2, + IpVersionType_X = 3, }; struct port_range { __u16 port_start; @@ -223,7 +227,7 @@ struct match_set { __u32 index; struct port_range port_range; enum L4ProtoType l4proto_type; - enum IpVersion ip_version; + enum IpVersionType ip_version; }; enum MatchType type; bool not ; // A subrule flag (this is not a match_set flag). 
@@ -255,11 +259,16 @@ struct ip_port_proto { __u8 proto; }; +struct pid_pname { + __u32 pid; + char pname[TASK_COMM_LEN]; +}; + struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __type(key, struct ip_port_proto); - __type(value, __u32); // pid - __uint(max_entries, MAX_SRC_PID_MAPPING_NUM); + __type(value, struct pid_pname); + __uint(max_entries, MAX_SRC_PID_PNAME_MAPPING_NUM); /// NOTICE: No persistence. // __uint(pinning, LIBBPF_PIN_BY_NAME); } src_pid_map SEC(".maps"); @@ -483,6 +492,10 @@ parse_transport(struct __sk_buff *skb, struct ethhdr *ethh, struct iphdr *iph, // Skip ethhdr for next hdr. offset += sizeof(struct ethhdr); + __builtin_memset(iph, 0, sizeof(struct iphdr)); + __builtin_memset(ipv6h, 0, sizeof(struct ipv6hdr)); + __builtin_memset(tcph, 0, sizeof(struct tcphdr)); + __builtin_memset(udph, 0, sizeof(struct udphdr)); *ihl = 0; *ipversion = 0; *l4proto = 0; @@ -796,18 +809,20 @@ static __always_inline int decap_after_udp_hdr(struct __sk_buff *skb, // Do not use __always_inline here because this function is too heavy. static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4], __be32 daddr[4], __be32 mac[4]) { -#define _l4proto flag[0] -#define _ipversion flag[1] -#define _hash flag[2] +#define _l4proto_type flag[0] +#define _ipversion_type flag[1] +#define _from_localhost flag[2] + + int ret; + /// TODO: BPF_MAP_UPDATE_BATCH ? __u32 key = MatchType_L4Proto; - int ret; - if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_l4proto, + if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_l4proto_type, BPF_ANY))) { return ret; }; key = MatchType_IpVersion; - if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_ipversion, + if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_ipversion_type, BPF_ANY))) { return ret; }; @@ -815,13 +830,14 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4], // Define variables for further use. 
__u16 h_dport; __u16 h_sport; - if (_l4proto == L4ProtoType_TCP) { + if (_l4proto_type == L4ProtoType_TCP) { h_dport = bpf_ntohs(((struct tcphdr *)l4_hdr)->dest); h_sport = bpf_ntohs(((struct tcphdr *)l4_hdr)->source); } else { h_dport = bpf_ntohs(((struct udphdr *)l4_hdr)->dest); h_sport = bpf_ntohs(((struct udphdr *)l4_hdr)->source); } + key = MatchType_SourcePort; if ((ret = bpf_map_update_elem(&h_port_map, &key, &h_sport, BPF_ANY))) { return ret; @@ -832,7 +848,7 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4], }; // Modify DNS upstream for routing. - if (h_dport == 53 && _l4proto == L4ProtoType_UDP) { + if (h_dport == 53 && _l4proto_type == L4ProtoType_UDP) { struct ip_port *upstream = bpf_map_lookup_elem(&dns_upstream_map, &zero_key); if (!upstream) { @@ -975,7 +991,7 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4], // bpf_printk("MATCHED: match_set->type: %u, match_set->not: %d", // match_set->type, match_set->not ); if (match_set->outbound == OUTBOUND_DIRECT && h_dport == 53 && - _l4proto == L4ProtoType_UDP) { + _l4proto_type == L4ProtoType_UDP) { // DNS packet should go through control plane. return OUTBOUND_CONTROL_PLANE_DIRECT; } @@ -987,8 +1003,9 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4], bpf_printk("No match_set hits. Did coder forget to sync " "common/consts/ebpf.go with enum MatchType?"); return -EPERM; -#undef _l4proto -#undef _ip_version +#undef _l4proto_type +#undef _ipversion_type +#undef _from_localhost } // Do DNAT. @@ -1010,13 +1027,15 @@ int tproxy_ingress(struct __sk_buff *skb) { bpf_printk("parse_transport: %d", ret); return TC_ACT_OK; } + bool is_ipv6 = ipversion == 6; + // Backup for further use. __be16 ipv4_tot_len = 0; // Parse saddr and daddr as ipv6 format. 
__be32 saddr[4]; __be32 daddr[4]; - if (ipversion == 4) { + if (!is_ipv6) { saddr[0] = 0; saddr[1] = 0; saddr[2] = bpf_htonl(0x0000ffff); @@ -1028,16 +1047,14 @@ int tproxy_ingress(struct __sk_buff *skb) { daddr[3] = iph.daddr; ipv4_tot_len = iph.tot_len; - } else if (ipversion == 6) { + } else { __builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH); __builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH); - } else { - return TC_ACT_OK; } // If this packet is sent to this host, accept it. __u32 tproxy_ip[4]; - int to_host = ip_is_host(ipversion == 6, skb->ifindex, daddr, tproxy_ip); + int to_host = ip_is_host(is_ipv6, skb->ifindex, daddr, tproxy_ip); if (to_host < 0) { // error // bpf_printk("to_host: %ld", to_host); return TC_ACT_OK; @@ -1069,12 +1086,12 @@ int tproxy_ingress(struct __sk_buff *skb) { // New TCP connection. // bpf_printk("[%X]New Connection", bpf_ntohl(tcph.seq)); __u32 flag[3] = {L4ProtoType_TCP}; // TCP - if (ipversion == 6) { - flag[1] = IpVersion_6; + if (is_ipv6) { + flag[1] = IpVersionType_6; } else { - flag[1] = IpVersion_4; + flag[1] = IpVersionType_4; } - flag[2] = skb->hash; + flag[2] = false; __be32 mac[4] = { 0, 0, @@ -1090,8 +1107,8 @@ int tproxy_ingress(struct __sk_buff *skb) { outbound = ret; // Print only new connection. - bpf_printk("tcp: outbound: %u, %pI6:%u", outbound, daddr, - bpf_ntohs(key_src.port)); + // bpf_printk("tcp(lan): outbound: %u, %pI6:%u", outbound, daddr, + // bpf_ntohs(key_src.port)); } else { // bpf_printk("[%X]Old Connection", bpf_ntohl(tcph.seq)); // The TCP connection exists. @@ -1122,8 +1139,8 @@ int tproxy_ingress(struct __sk_buff *skb) { __u32 *dst_ip = daddr; __u16 dst_port = tcph.dest; - if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_TCP, ihl, dst_ip, - tproxy_ip, true))) { + if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_TCP, ihl, dst_ip, tproxy_ip, + true))) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } @@ -1143,12 +1160,12 @@ int tproxy_ingress(struct __sk_buff *skb) { // Routing. 
It decides if we redirect traffic to control plane. __u32 flag[3] = {L4ProtoType_UDP}; - if (ipversion == 6) { - flag[1] = IpVersion_6; + if (is_ipv6) { + flag[1] = IpVersionType_6; } else { - flag[1] = IpVersion_4; + flag[1] = IpVersionType_4; } - flag[2] = skb->hash; + flag[2] = false; __be32 mac[4] = { 0, 0, @@ -1161,8 +1178,8 @@ int tproxy_ingress(struct __sk_buff *skb) { return TC_ACT_SHOT; } new_hdr.outbound = ret; - bpf_printk("udp: outbound: %u, %pI6:%u", new_hdr.outbound, daddr, - bpf_ntohs(new_hdr.port)); + // bpf_printk("udp(lan): outbound: %u, %pI6:%u", new_hdr.outbound, daddr, + // bpf_ntohs(new_hdr.port)); if (new_hdr.outbound == OUTBOUND_DIRECT) { return TC_ACT_OK; @@ -1172,12 +1189,12 @@ int tproxy_ingress(struct __sk_buff *skb) { // Rewrite to control plane. // Encap a header to transmit fullcone tuple. - encap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &new_hdr, + encap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &new_hdr, sizeof(new_hdr)); // Rewrite udp dst ip. // bpf_printk("rewrite dst ip from %pI4", &ori_dst.ip); - if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_UDP, ihl, new_hdr.ip, + if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_UDP, ihl, new_hdr.ip, tproxy_ip, true))) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; @@ -1242,12 +1259,13 @@ int tproxy_egress(struct __sk_buff *skb) { if (ret) { return TC_ACT_OK; } + bool is_ipv6 = ipversion == 6; // Parse saddr and daddr as ipv6 format. 
__be32 saddr[4]; __be32 daddr[4]; __be16 ipv4_tot_len = 0; - if (ipversion == 4) { + if (!is_ipv6) { saddr[0] = 0; saddr[1] = 0; saddr[2] = bpf_htonl(0x0000ffff); @@ -1259,11 +1277,9 @@ int tproxy_egress(struct __sk_buff *skb) { daddr[3] = iph.daddr; ipv4_tot_len = iph.tot_len; - } else if (ipversion == 6) { - __builtin_memcpy(daddr, ipv6h.daddr.in6_u.u6_addr32, IPV6_BYTE_LENGTH); - __builtin_memcpy(saddr, ipv6h.saddr.in6_u.u6_addr32, IPV6_BYTE_LENGTH); } else { - return TC_ACT_OK; + __builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH); + __builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH); } __be16 sport; if (l4proto == IPPROTO_TCP) { @@ -1280,7 +1296,7 @@ int tproxy_egress(struct __sk_buff *skb) { return TC_ACT_OK; } __be32 tproxy_ip[4]; - ret = ip_is_host(ipversion == 6, skb->ifindex, saddr, tproxy_ip); + ret = ip_is_host(is_ipv6, skb->ifindex, saddr, tproxy_ip); if (!(ret == 1) || !equal_ipv6_format(saddr, tproxy_ip)) { return TC_ACT_OK; } @@ -1308,8 +1324,8 @@ int tproxy_egress(struct __sk_buff *skb) { __u32 *src_ip = saddr; __u16 src_port = tcph.source; - if (rewrite_ip(skb, ipversion == 6, IPPROTO_TCP, ihl, src_ip, - original_dst->ip, false) < 0) { + if (rewrite_ip(skb, is_ipv6, IPPROTO_TCP, ihl, src_ip, original_dst->ip, + false) < 0) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } @@ -1333,12 +1349,12 @@ int tproxy_egress(struct __sk_buff *skb) { // Get source ip/port from our packet header. // Decap header to get fullcone tuple. 
- decap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &ori_src, + decap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &ori_src, sizeof(ori_src)); // Rewrite udp src ip - if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_UDP, ihl, src_ip, - ori_src.ip, false))) { + if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_UDP, ihl, src_ip, ori_src.ip, + false))) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } @@ -1378,6 +1394,39 @@ int tproxy_egress(struct __sk_buff *skb) { return TC_ACT_OK; } +// This function will modify the content of src_key. +static __always_inline struct pid_pname * +lookup_src_pid_map(bool is_ipv6, struct ip_port_proto *src_key) { + // Lookup twice or third. First for unspecific address, second for interface + // address. + + // Lookup pid in src_pid_map. + struct pid_pname *pid_pname; + if ((pid_pname = bpf_map_lookup_elem(&src_pid_map, src_key))) { + return pid_pname; + } + + // Second look-up. + // Set to unspecific address. + if (is_ipv6) { + __builtin_memset(src_key, 0, sizeof(struct ip_port_proto)); + } else { + src_key->ip[3] = 0; + } + if ((pid_pname = bpf_map_lookup_elem(&src_pid_map, src_key))) { + return pid_pname; + } + if (is_ipv6) { + return NULL; + } + + // Third look-up for IPv4 packet. + // Lookup IPv6 unspecific address. 
+ // https://github.com/torvalds/linux/blob/62fb9874f5da54fdb243003b386128037319b219/net/ipv4/af_inet.c#L475 + src_key->ip[2] = 0; + return bpf_map_lookup_elem(&src_pid_map, src_key); +} + __u8 special_mac_to_tproxy[6] = {2, 0, 2, 3, 0, 0}; __u8 special_mac_from_tproxy[6] = {2, 0, 2, 3, 0, 1}; @@ -1389,11 +1438,9 @@ int tproxy_wan_egress(struct __sk_buff *skb) { if (skb->ingress_ifindex != NOWHERE_IFINDEX) { return TC_ACT_OK; } - if ((skb->mark & 0x80) == 0x80) { - return TC_ACT_OK; - } else if (skb->mark > 0) { - bpf_printk("mark", skb->mark); - } + // if ((skb->mark & 0x80) == 0x80) { + // return TC_ACT_OK; + // } struct ethhdr ethh; struct iphdr iph; @@ -1409,6 +1456,7 @@ int tproxy_wan_egress(struct __sk_buff *skb) { if (ret) { return TC_ACT_OK; } + bool is_ipv6 = ipversion == 6; __be16 sport; if (l4proto == IPPROTO_TCP) { @@ -1434,7 +1482,7 @@ int tproxy_wan_egress(struct __sk_buff *skb) { // Parse saddr and daddr as ipv6 format. __be32 saddr[4]; __be32 daddr[4]; - if (ipversion == 4) { + if (!is_ipv6) { saddr[0] = 0; saddr[1] = 0; saddr[2] = bpf_htonl(0x0000ffff); @@ -1446,12 +1494,11 @@ int tproxy_wan_egress(struct __sk_buff *skb) { daddr[3] = iph.daddr; ipv4_tot_len = iph.tot_len; - } else if (ipversion == 6) { + } else { __builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH); __builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH); - } else { - return TC_ACT_OK; } + if (tproxy_response) { // Packets from tproxy port. // We need to redirect it to original port. @@ -1475,6 +1522,37 @@ int tproxy_wan_egress(struct __sk_buff *skb) { return TC_ACT_REDIRECT; } else { // Normal packets. + + // Prepare key. + struct ip_port_proto src_key; + __builtin_memset(&src_key, 0, sizeof(struct ip_port_proto)); + src_key.proto = l4proto; + __builtin_memcpy(src_key.ip, saddr, IPV6_BYTE_LENGTH); + src_key.port = sport; + + struct pid_pname *pid_pname = lookup_src_pid_map(is_ipv6, &src_key); + if (pid_pname) { + // Get tproxy pid and compare if they are equal. 
+ __u32 *pid_tproxy; + if (!(pid_tproxy = + bpf_map_lookup_elem(&param_map, &control_plane_pid_key))) { + bpf_printk("control_plane_pid is not set."); + return TC_ACT_SHOT; + } + if (pid_pname->pid == *pid_tproxy) { + // Control plane to direct. + // bpf_printk("Control plane to direct."); + return TC_ACT_OK; + } + } else { + if ((skb->mark & 0x80) == 0x80) { + bpf_printk("No pid_pname found. But it should not happen: %pI6:%u (%u)", + saddr, bpf_ntohs(sport), l4proto); + } + } + + // Not from tproxy; from other processes. + if (l4proto == IPPROTO_TCP) { // Backup for further use. tcp_state_syn = tcph.syn && !tcph.ack; @@ -1489,12 +1567,12 @@ int tproxy_wan_egress(struct __sk_buff *skb) { // New TCP connection. // bpf_printk("[%X]New Connection", bpf_ntohl(tcph.seq)); __u32 flag[3] = {L4ProtoType_TCP}; // TCP - if (ipversion == 6) { - flag[1] = IpVersion_6; + if (is_ipv6) { + flag[1] = IpVersionType_6; } else { - flag[1] = IpVersion_4; + flag[1] = IpVersionType_4; } - flag[2] = skb->hash; + flag[2] = true; __be32 mac[4] = { 0, 0, @@ -1510,8 +1588,8 @@ int tproxy_wan_egress(struct __sk_buff *skb) { outbound = ret; // Print only new connection. - bpf_printk("tcp: outbound: %u, %pI6:%u", outbound, daddr, - bpf_ntohs(key_src.port)); + // bpf_printk("tcp(wan): outbound: %u, %pI6:%u", outbound, daddr, + // bpf_ntohs(key_src.port)); } else { // bpf_printk("[%X]Old Connection", bpf_ntohl(tcph.seq)); // The TCP connection exists. @@ -1569,12 +1647,12 @@ int tproxy_wan_egress(struct __sk_buff *skb) { // Routing. It decides if we redirect traffic to control plane.
__u32 flag[3] = {L4ProtoType_UDP}; - if (ipversion == 6) { - flag[1] = IpVersion_6; + if (is_ipv6) { + flag[1] = IpVersionType_6; } else { - flag[1] = IpVersion_4; + flag[1] = IpVersionType_4; } - flag[2] = skb->hash; + flag[2] = true; __be32 mac[4] = { 0, 0, @@ -1587,8 +1665,8 @@ int tproxy_wan_egress(struct __sk_buff *skb) { return TC_ACT_SHOT; } new_hdr.outbound = ret; - bpf_printk("udp: outbound: %u, %pI6:%u", new_hdr.outbound, daddr, - bpf_ntohs(new_hdr.port)); + // bpf_printk("udp(wan): outbound: %u, %pI6:%u", new_hdr.outbound, daddr, + // bpf_ntohs(new_hdr.port)); if (new_hdr.outbound == OUTBOUND_DIRECT) { return TC_ACT_OK; @@ -1610,7 +1688,7 @@ int tproxy_wan_egress(struct __sk_buff *skb) { }; // Encap a header to transmit fullcone tuple. - encap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &new_hdr, + encap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &new_hdr, sizeof(new_hdr)); // Redirect from egress to ingress. @@ -1641,6 +1719,7 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { if (ret) { return TC_ACT_OK; } + bool is_ipv6 = ipversion == 6; // bpf_printk("bpf_ntohs(*(__u16 *)&ethh.h_source[4]): %u", // bpf_ntohs(*(__u16 *)&ethh.h_source[4])); @@ -1655,7 +1734,7 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { __be32 saddr[4]; __be32 daddr[4]; __be32 ipv4_tot_len = 0; - if (ipversion == 4) { + if (!is_ipv6) { saddr[0] = 0; saddr[1] = 0; saddr[2] = bpf_htonl(0x0000ffff); @@ -1667,11 +1746,9 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { daddr[3] = iph.daddr; ipv4_tot_len = iph.tot_len; - } else if (ipversion == 6) { + } else { __builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH); __builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH); - } else { - return TC_ACT_OK; } __be16 sport; __be16 dport; @@ -1713,8 +1790,8 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { } // Rewrite sip and sport.
- if (rewrite_ip(skb, ipversion == 6, IPPROTO_TCP, ihl, saddr, - original_dst->ip, false) < 0) { + if (rewrite_ip(skb, is_ipv6, IPPROTO_TCP, ihl, saddr, original_dst->ip, + false) < 0) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } @@ -1734,12 +1811,12 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { // Get source ip/port from our packet header. // Decap header to get fullcone tuple. - decap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &ori_src, + decap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &ori_src, sizeof(ori_src)); // Rewrite udp src ip - if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_UDP, ihl, saddr, - ori_src.ip, false))) { + if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_UDP, ihl, saddr, ori_src.ip, + false))) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } @@ -1762,7 +1839,7 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { // } } // Rewrite dip. - if (rewrite_ip(skb, ipversion == 6, l4proto, ihl, daddr, saddr, true) < 0) { + if (rewrite_ip(skb, is_ipv6, l4proto, ihl, daddr, saddr, true) < 0) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } @@ -1780,15 +1857,14 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { // bpf_printk("should send to: %pI6:%u", tproxy_ip, // bpf_ntohs(*tproxy_port)); - if ((ret = rewrite_ip(skb, ipversion == 6, l4proto, ihl, daddr, tproxy_ip, - true))) { + if ((ret = + rewrite_ip(skb, is_ipv6, l4proto, ihl, daddr, tproxy_ip, true))) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } // (1) Use daddr as saddr to pass NIC verification. Notice that we do not // modify the so tproxy will send packet to it. 
- if ((ret = rewrite_ip(skb, ipversion == 6, l4proto, ihl, saddr, daddr, - false))) { + if ((ret = rewrite_ip(skb, is_ipv6, l4proto, ihl, saddr, daddr, false))) { bpf_printk("Shot IP: %d", ret); return TC_ACT_SHOT; } @@ -1807,52 +1883,210 @@ int tproxy_wan_ingress(struct __sk_buff *skb) { return TC_ACT_OK; } -// SEC("socket/src_pid_mapper") -// int src_pid_mapper(struct __sk_buff *skb) { -// struct ethhdr ethh; -// struct iphdr iph; -// struct ipv6hdr ipv6h; -// struct tcphdr tcph; -// struct udphdr udph; -// __u8 ihl; -// __u8 ipversion; -// __u8 l4proto; -// int ret = parse_transport(skb, &ethh, &iph, &ipv6h, &tcph, &udph, &ihl, -// &ipversion, &l4proto); -// if (ret) { -// return 0; -// } +// Get sockfd bind addr. +SEC("kprobe/sys_bind") +int src_pid_mapper(struct pt_regs *ctx) { + struct sockaddr_in *in = (struct sockaddr_in *)PT_REGS_PARM2(ctx); + struct sockaddr_in6 *in6 = NULL; + __kernel_sa_family_t family = 0; -// struct ip_port_proto src_key; -// __builtin_memset(&src_key, 0, sizeof(src_key)); -// if (ipversion == 4) { -// src_key.ip[0] = 0; -// src_key.ip[1] = 0; -// src_key.ip[2] = bpf_htonl(0x0000ffff); -// src_key.ip[3] = iph.saddr; + int ret = bpf_core_read_user(&family, sizeof(family), &in->sin_family); + if (ret) { + if (ret == -EFAULT) { + bpf_printk("sys_bind: Failed to read data from memory.
Maybe data is in " + "swap space.", + ret); + } else { + bpf_printk("sys_bind: %d", ret); + } + return 0; + } -// } else if (ipversion == 6) { -// __builtin_memcpy(src_key.ip, &ipv6h.saddr, IPV6_BYTE_LENGTH); -// } else { -// return 0; -// } -// if (l4proto == IPPROTO_TCP) { -// src_key.port = tcph.source; -// src_key.proto = IPPROTO_TCP; -// } else if (l4proto == IPPROTO_UDP) { -// src_key.port = udph.source; -// src_key.proto = IPPROTO_UDP; -// } else { -// return 0; -// } -// __u32 pid = bpf_get_current_pid_tgid() >> 32; -// if ((ret = bpf_map_update_elem(&src_pid_map, &src_key, &pid, BPF_ANY))) { -// bpf_printk("socket_pid_mapper: failed update map: %d", ret); -// return 0; -// } -// bpf_printk("socket_pid_mapper: %pI6:%u -> %u", src_key.ip, -// bpf_ntohs(src_key.port), pid); -// return 0; -// } + struct ip_port_proto src_key; + __builtin_memset(&src_key, 0, sizeof(src_key)); + if (family == AF_INET6) { + in6 = (struct sockaddr_in6 *)in; + in = NULL; + bpf_core_read_user(src_key.ip, sizeof(src_key.ip), &in6->sin6_addr); + bpf_core_read_user(&src_key.port, sizeof(src_key.port), &in6->sin6_port); + } else if (family == AF_INET) { + bpf_core_read_user(&src_key.ip[3], sizeof(src_key.ip[3]), &in->sin_addr); + src_key.ip[2] = bpf_htonl(0x0000ffff); + bpf_core_read_user(&src_key.port, sizeof(src_key.port), &in->sin_port); + } else { + bpf_printk("family: %d", family); + return 0; + } + + __u32 pid = bpf_get_current_pid_tgid() >> 32; + if ((ret = bpf_map_update_elem(&src_pid_map, &src_key, &pid, BPF_ANY))) { + bpf_printk("socket_pid_mapper: failed update map: %d", ret); + return 0; + } + bpf_printk("socket_pid_mapper: %pI6:%u -> %u", src_key.ip, + bpf_ntohs(src_key.port), pid); + return 0; +} + +static int __always_inline build_key_by_sk(struct sock *sk, + struct ip_port_proto *src_key) { + + // Build key. 
+ __builtin_memset(src_key, 0, sizeof(struct ip_port_proto)); + + __u16 sk_type = BPF_CORE_READ(sk, sk_type); + if (sk_type == SOCK_STREAM) { + src_key->proto = IPPROTO_TCP; + // bpf_printk("TCP bind"); + } else if (sk_type == SOCK_DGRAM) { + src_key->proto = IPPROTO_UDP; + // bpf_printk("UDP bind"); + } else if (sk_type == SOCK_RAW) { + __u16 sk_proto = BPF_CORE_READ(sk, sk_protocol); + if (sk_proto == IPPROTO_TCP) { + src_key->proto = IPPROTO_TCP; + // bpf_printk("RAW TCP bind"); + } else if (sk_proto == IPPROTO_TCP) { + src_key->proto = IPPROTO_UDP; + // bpf_printk("RAW UDP bind"); + } else { + return -ERANGE; + } + } else { + return -ERANGE; + } + struct inet_sock *inet = (struct inet_sock *)sk; + unsigned short family = BPF_CORE_READ(sk, __sk_common.skc_family); + if (family == AF_INET) { + src_key->ip[2] = bpf_htonl(0x0000ffff); + src_key->ip[3] = BPF_CORE_READ(inet, inet_saddr); + } else if (family == AF_INET6) { + BPF_CORE_READ_INTO(&src_key->ip, inet, pinet6, saddr.in6_u.u6_addr32); + } else { + if (family == AF_UNSPEC) { + bpf_printk("oh shit AF_UNSPEC"); + } + return -ERANGE; + } + src_key->port = BPF_CORE_READ(inet, inet_sport); + return 0; +} + +static int __always_inline update_map_elem_by_sk(struct sock *sk) { + int ret; + + // Build key. + struct ip_port_proto src_key; + if ((ret = build_key_by_sk(sk, &src_key))) { + return ret; + } + + // Build value. + struct pid_pname val; + __builtin_memset(&val, 0, sizeof(struct pid_pname)); + val.pid = bpf_get_current_pid_tgid() >> 32; + if ((ret = bpf_get_current_comm(val.pname, sizeof(val.pname)))) { + return ret; + } + + // Update map. + /// TODO: We can use BPF_NOEXIST here to improve the performance. + /// But will the socket be released after processes dead abnormally? 
+ if ((ret = bpf_map_update_elem(&src_pid_map, &src_key, &val, BPF_ANY))) { + // bpf_printk("setup_mapping_from_sk: failed update map: %d", ret); + return ret; + } + + // bpf_printk("setup_mapping_from_sk: %pI6:%u (%d)", src_key.ip, + // bpf_ntohs(src_key.port), src_key.proto); + // bpf_printk("setup_mapping_from_sk: -> %s (%d)", val.pname, val.pid); + return 0; +} + +// Get sip, sport to pid, pname mapping. +// kernel 5.5+ +// IPv4/IPv6 TCP/UDP send. +SEC("fexit/inet_release") +int BPF_PROG(inet_release, struct sock *sk, int ret) { + if (unlikely(ret)) { + return 0; + } + // Build key. + struct ip_port_proto src_key; + if ((ret = build_key_by_sk(sk, &src_key))) { + return 0; + } + if ((ret = bpf_map_delete_elem(&src_pid_map, &src_key))) { + // bpf_printk("setup_mapping_from_sk: failed update map: %d", ret); + return 0; + } + return 0; +} + +// Get sip, sport to pid, pname mapping. +// kernel 5.5+ +// IPv4/IPv6 TCP/UDP send. +SEC("fexit/inet_send_prepare") +int BPF_PROG(inet_send_prepare, struct sock *sk, int ret) { + if (unlikely(ret)) { + return 0; + } + /// TODO: inet_release + update_map_elem_by_sk(sk); + return 0; +} + +// Get sip, sport to pid, pname mapping. +// kernel 5.5+ +// IPv4 TCP/UDP listen. +SEC("fexit/inet_bind") +int BPF_PROG(inet_bind, struct socket *sock, struct sockaddr *uaddr, + int addr_len, int ret) { + if (ret) { + return 0; + } + /// TODO: inet_release + update_map_elem_by_sk(sock->sk); + return 0; +} + +// Get sip, sport to pid, pname mapping. +// kernel 5.5+ +// IPv4 TCP connect. +// We use fentry because it "Build a SYN and send it off". +// https://github.com/torvalds/linux/blob/62fb9874f5da54fdb243003b386128037319b219/net/ipv4/tcp_output.c#L3820 +SEC("fentry/tcp_connect") +int BPF_PROG(tcp_connect, struct sock *sk) { + /// TODO: inet4_release + update_map_elem_by_sk(sk); + return 0; +} + +// Get sip, sport to pid, pname mapping. +// kernel 5.5+ +// IPv4 UDP sendto/sendmsg. 
+SEC("fexit/inet_autobind") +int BPF_PROG(inet_autobind, struct sock *sk, int ret) { + if (ret) { + return 0; + } + /// TODO: inet4_release + update_map_elem_by_sk(sk); + return 0; +} + +// Get sip, sport to pid, pname mapping. +// kernel 5.5+ +// IPv6 TCP/UDP listen. +SEC("fexit/inet6_bind") +int BPF_PROG(inet6_bind, struct socket *sock, struct sockaddr *uaddr, + int addr_len, int ret) { + if (ret) { + return 0; + } + /// TODO: inet6_release + update_map_elem_by_sk(sock->sk); + return 0; +} SEC("license") const char __license[] = "Dual BSD/GPL"; \ No newline at end of file diff --git a/component/control/tcp.go b/component/control/tcp.go index 0ef1f79..3959bc6 100644 --- a/component/control/tcp.go +++ b/component/control/tcp.go @@ -6,7 +6,6 @@ package control import ( - "errors" "fmt" "github.com/mzz2017/softwind/pkg/zeroalloc/io" "github.com/v2rayA/dae/common" @@ -14,6 +13,7 @@ import ( internal "github.com/v2rayA/dae/pkg/ebpf_internal" "net" "net/netip" + "strings" "time" ) @@ -57,11 +57,13 @@ func (c *ControlPlane) handleConn(lConn net.Conn) (err error) { } defer rConn.Close() if err = RelayTCP(lConn, rConn); err != nil { - var netErr net.Error - if errors.As(err, &netErr) && netErr.Timeout() { - return nil // ignore i/o timeout + switch { + case strings.HasSuffix(err.Error(), "write: broken pipe"), + strings.HasSuffix(err.Error(), "i/o timeout"): + return nil // ignore + default: + return fmt.Errorf("handleTCP relay error: %w", err) } - return fmt.Errorf("handleTCP relay error: %w", err) } return nil } diff --git a/insert.sh b/insert.sh index a7145c7..5a8fda9 100755 --- a/insert.sh +++ b/insert.sh @@ -10,7 +10,7 @@ set -ex sudo rm -rf /sys/fs/bpf/tc/globals/* # clang -fno-stack-protector -O2 -g -emit-llvm -c component/control/kern/tproxy.c -o - | llc -march=bpf -mcpu=v3 -mattr=+alu32 -filetype=obj -o foo.o -clang -O2 -g -Wall -Werror -c component/control/kern/tproxy.c -target bpf -o foo.o +clang -O2 -g -Wall -Werror -c component/control/kern/tproxy.c -target 
bpf -D__TARGET_ARCH_x86 -o foo.o sudo tc filter del dev $lan ingress sudo tc filter del dev $lan egress sudo tc filter del dev $wan ingress