fix: problem that node address cannot be domain

This commit is contained in:
mzz2017
2023-01-31 18:08:38 +08:00
parent 3602e2e26b
commit e60295dac7
12 changed files with 921 additions and 162 deletions

5
.gitignore vendored
View File

@ -2,6 +2,7 @@
.idea
*.o
*.tmp
bpf_bpfeb.go
bpf_bpfel.go
bpf_bpfeb*.go
bpf_bpfel*.go
dae
vmlinux.h

View File

@ -10,6 +10,7 @@ CLANG ?= clang
STRIP ?= llvm-strip
OUTPUT ?= dae
CFLAGS := -O2 -Wall -Werror $(CFLAGS)
GOARCH ?= amd64
# Get version from .git.
date=$(shell git log -1 --format="%cd" --date=short | sed s/-//g)
@ -30,6 +31,7 @@ dae: ebpf
ebpf: export BPF_CLANG := $(CLANG)
ebpf: export BPF_STRIP := $(STRIP)
ebpf: export BPF_CFLAGS := $(CFLAGS)
ebpf: export BPF_GOARCH := $(GOARCH)
ebpf:
unset GOOS && \
unset GOARCH && \

View File

@ -32,5 +32,4 @@ See [example.dae](https://github.com/v2rayA/dae/blob/main/example.dae).
1. Handle the case that nodes do not support UDP.
1. L4Checksum problem.
1. Config support list like: `wan_interface: [wlp5s0, eth0]`.
1. Fix problem that node address cannot be domain.
1. ...

View File

@ -19,6 +19,7 @@ const (
BigEndianTproxyPortKey
DisableL4TxChecksumKey
DisableL4RxChecksumKey
ControlPlaneOidKey
)
type DisableL4ChecksumPolicy uint32

View File

@ -6,4 +6,5 @@
package control
// $BPF_CLANG and $BPF_CFLAGS are set by the Makefile.
//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc $BPF_CLANG -strip $BPF_STRIP -cflags $BPF_CFLAGS bpf kern/tproxy.c --
//go:generate sh -c "bpftool btf dump file /sys/kernel/btf/vmlinux format c > kern/headers/vmlinux.h"
//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc $BPF_CLANG -strip $BPF_STRIP -cflags $BPF_CFLAGS -target $BPF_GOARCH bpf kern/tproxy.c -- -I./headers

View File

@ -10,6 +10,7 @@ import (
"errors"
"fmt"
"github.com/cilium/ebpf"
ciliumLink "github.com/cilium/ebpf/link"
"github.com/cilium/ebpf/rlimit"
"github.com/mzz2017/softwind/pool"
"github.com/sirupsen/logrus"
@ -116,6 +117,11 @@ retryLoadBpf:
if err = bpf.ParamMap.Update(consts.DisableL4RxChecksumKey, consts.DisableL4ChecksumPolicy_SetZero, ebpf.UpdateAny); err != nil {
return nil, err
}
// Write tproxy (control plane) PID.
if err = bpf.ParamMap.Update(consts.ControlPlaneOidKey, uint32(os.Getpid()), ebpf.UpdateAny); err != nil {
return nil, err
}
// Write ip_proto to hdr_size map for IPv6 extension extraction.
if err = bpf.IpprotoHdrsizeMap.Update(uint32(unix.IPPROTO_HOPOPTS), int32(-1), ebpf.UpdateAny); err != nil {
return nil, err
}
@ -368,14 +374,89 @@ func (c *ControlPlane) BindWan(ifname string) error {
return err
}
//// Insert SrcPidMapper.
//sock, err := internal.OpenRawSock(link.Attrs().Index)
//if err != nil {
// return fmt.Errorf("failed to open raw sock: %v: %w", ifname, err)
//}
//if err = unix.SetsockoptInt(sock, unix.SOL_SOCKET, unix.SO_ATTACH_BPF, c.bpf.bpfPrograms.SrcPidMapper.FD()); err != nil {
// return fmt.Errorf("failed to attach SrcPidMapper")
//}
version, e := internal.KernelVersion()
if e != nil {
return fmt.Errorf("BindWan: failed to get kernel version: %w", e)
}
ftraceFeatureVersion := internal.Version{5, 5, 0}
if version.Less(ftraceFeatureVersion) {
// This kernel does not support ftrace-based fentry/fexit programs,
// which the PID bypass depends on.
return fmt.Errorf("your kernel version %v does not support bind to WAN; expect >=%v", version.String(), ftraceFeatureVersion.String())
}
// Set-up SrcPidMapper.
// Attach programs to support pname routing.
// ipv4 tcp/udp: send
inetSendPrepare, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{
Program: c.bpf.InetSendPrepare,
})
if err != nil {
return fmt.Errorf("AttachTracing InetSendPrepare: %w", err)
}
c.deferFuncs = append(c.deferFuncs, func() error {
if err := inetSendPrepare.Close(); err != nil {
return fmt.Errorf("inetSendPrepare.Close(): %w", err)
}
return nil
})
// ipv4 tcp/udp: listen
inetBind, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{
Program: c.bpf.InetBind,
})
if err != nil {
return fmt.Errorf("AttachTracing InetBind: %w", err)
}
c.deferFuncs = append(c.deferFuncs, func() error {
if err := inetBind.Close(); err != nil {
return fmt.Errorf("inetBind.Close(): %w", err)
}
return nil
})
// ipv4 udp: sendto/sendmsg
inetAutoBind, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{
Program: c.bpf.InetAutobind,
})
if err != nil {
return fmt.Errorf("AttachTracing InetAutobind: %w", err)
}
c.deferFuncs = append(c.deferFuncs, func() error {
if err := inetAutoBind.Close(); err != nil {
return fmt.Errorf("inetAutoBind.Close(): %w", err)
}
return nil
})
// ipv4 tcp: connect
tcpConnect, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{
Program: c.bpf.TcpConnect,
})
if err != nil {
return fmt.Errorf("AttachTracing TcpConnect: %w", err)
}
// Register a cleanup step that closes the tcpConnect tracing link.
c.deferFuncs = append(c.deferFuncs, func() error {
	if err := tcpConnect.Close(); err != nil {
		// Label the error with the link that actually failed, matching the
		// tcpConnect variable and the other cleanup closures in this function.
		return fmt.Errorf("tcpConnect.Close(): %w", err)
	}
	return nil
})
// ipv6 tcp/udp: listen
inet6Bind, err := ciliumLink.AttachTracing(ciliumLink.TracingOptions{
Program: c.bpf.Inet6Bind,
})
if err != nil {
return fmt.Errorf("AttachTracing Inet6Bind: %w", err)
}
c.deferFuncs = append(c.deferFuncs, func() error {
if err := inet6Bind.Close(); err != nil {
return fmt.Errorf("inet6Bind.Close(): %w", err)
}
return nil
})
// Insert qdisc and tc filters.
qdisc := &netlink.GenericQdisc{

View File

@ -0,0 +1,137 @@
/*
 * Ethernet frame-size constants (ETH_*LEN, ETH_*_MTU) and EtherType /
 * protocol ID macros (ETH_P_*).
 *
 * NOTE(review): presumably kept as a local header because the generated
 * vmlinux.h provides types but no preprocessor macros -- confirm.
 */
#ifndef __IF_ETHER_DEFS_H__
#define __IF_ETHER_DEFS_H__
/*
* IEEE 802.3 Ethernet magic constants. The frame sizes omit the preamble
* and FCS/CRC (frame check sequence).
*/
#define ETH_ALEN 6 /* Octets in one ethernet addr */
#define ETH_TLEN 2 /* Octets in ethernet type field */
#define ETH_HLEN 14 /* Total octets in header. */
#define ETH_ZLEN 60 /* Min. octets in frame sans FCS */
#define ETH_DATA_LEN 1500 /* Max. octets in payload */
#define ETH_FRAME_LEN 1514 /* Max. octets in frame sans FCS */
#define ETH_FCS_LEN 4 /* Octets in the FCS */
#define ETH_MIN_MTU 68 /* Min IPv4 MTU per RFC791 */
#define ETH_MAX_MTU 0xFFFFU /* 65535, same as IP_MAX_MTU */
/*
* These are the defined Ethernet Protocol ID's.
*/
#define ETH_P_LOOP 0x0060 /* Ethernet Loopback packet */
#define ETH_P_PUP 0x0200 /* Xerox PUP packet */
#define ETH_P_PUPAT 0x0201 /* Xerox PUP Addr Trans packet */
#define ETH_P_TSN 0x22F0 /* TSN (IEEE 1722) packet */
#define ETH_P_ERSPAN2 0x22EB /* ERSPAN version 2 (type III) */
#define ETH_P_IP 0x0800 /* Internet Protocol packet */
#define ETH_P_X25 0x0805 /* CCITT X.25 */
#define ETH_P_ARP 0x0806 /* Address Resolution packet */
#define ETH_P_BPQ 0x08FF /* G8BPQ AX.25 Ethernet Packet [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_IEEEPUP 0x0a00 /* Xerox IEEE802.3 PUP packet */
#define ETH_P_IEEEPUPAT 0x0a01 /* Xerox IEEE802.3 PUP Addr Trans packet */
#define ETH_P_BATMAN 0x4305 /* B.A.T.M.A.N.-Advanced packet [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_DEC 0x6000 /* DEC Assigned proto */
#define ETH_P_DNA_DL 0x6001 /* DEC DNA Dump/Load */
#define ETH_P_DNA_RC 0x6002 /* DEC DNA Remote Console */
#define ETH_P_DNA_RT 0x6003 /* DEC DNA Routing */
#define ETH_P_LAT 0x6004 /* DEC LAT */
#define ETH_P_DIAG 0x6005 /* DEC Diagnostics */
#define ETH_P_CUST 0x6006 /* DEC Customer use */
#define ETH_P_SCA 0x6007 /* DEC Systems Comms Arch */
#define ETH_P_TEB 0x6558 /* Trans Ether Bridging */
#define ETH_P_RARP 0x8035 /* Reverse Addr Res packet */
#define ETH_P_ATALK 0x809B /* Appletalk DDP */
#define ETH_P_AARP 0x80F3 /* Appletalk AARP */
#define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */
#define ETH_P_ERSPAN 0x88BE /* ERSPAN type II */
#define ETH_P_IPX 0x8137 /* IPX over DIX */
#define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */
#define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. See 802.3 31B */
#define ETH_P_SLOW 0x8809 /* Slow Protocol. See 802.3ad 43B */
#define ETH_P_WCCP 0x883E /* Web-cache coordination protocol
* defined in draft-wilson-wrec-wccp-v2-00.txt */
#define ETH_P_MPLS_UC 0x8847 /* MPLS Unicast traffic */
#define ETH_P_MPLS_MC 0x8848 /* MPLS Multicast traffic */
#define ETH_P_ATMMPOA 0x884c /* MultiProtocol Over ATM */
#define ETH_P_PPP_DISC 0x8863 /* PPPoE discovery messages */
#define ETH_P_PPP_SES 0x8864 /* PPPoE session messages */
#define ETH_P_LINK_CTL 0x886c /* HPNA, wlan link local tunnel */
#define ETH_P_ATMFATE 0x8884 /* Frame-based ATM Transport
* over Ethernet
*/
#define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */
#define ETH_P_PROFINET 0x8892 /* PROFINET */
#define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */
#define ETH_P_AOE 0x88A2 /* ATA over Ethernet */
#define ETH_P_ETHERCAT 0x88A4 /* EtherCAT */
#define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */
#define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */
#define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */
#define ETH_P_TIPC 0x88CA /* TIPC */
#define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */
#define ETH_P_MRP 0x88E3 /* Media Redundancy Protocol */
#define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */
#define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */
#define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */
#define ETH_P_1588 0x88F7 /* IEEE 1588 Timesync */
#define ETH_P_NCSI 0x88F8 /* NCSI protocol */
#define ETH_P_PRP 0x88FB /* IEC 62439-3 PRP/HSRv0 */
#define ETH_P_CFM 0x8902 /* Connectivity Fault Management */
#define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */
#define ETH_P_IBOE 0x8915 /* Infiniband over Ethernet */
#define ETH_P_TDLS 0x890D /* TDLS */
#define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */
#define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */
#define ETH_P_HSR 0x892F /* IEC 62439-3 HSRv1 */
#define ETH_P_NSH 0x894F /* Network Service Header */
#define ETH_P_LOOPBACK 0x9000 /* Ethernet loopback packet, per IEEE 802.3 */
#define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */
#define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is more than this value
* then the frame is Ethernet II. Else it is 802.3 */
/*
* Non DIX types. Won't clash for 1500 types.
*/
#define ETH_P_802_3 0x0001 /* Dummy type for 802.3 frames */
#define ETH_P_AX25 0x0002 /* Dummy protocol id for AX.25 */
#define ETH_P_ALL 0x0003 /* Every packet (be careful!!!) */
#define ETH_P_802_2 0x0004 /* 802.2 frames */
#define ETH_P_SNAP 0x0005 /* Internal only */
#define ETH_P_DDCMP 0x0006 /* DEC DDCMP: Internal only */
#define ETH_P_WAN_PPP 0x0007 /* Dummy type for WAN PPP frames*/
#define ETH_P_PPP_MP 0x0008 /* Dummy type for PPP MP frames */
#define ETH_P_LOCALTALK 0x0009 /* Localtalk pseudo type */
#define ETH_P_CAN 0x000C /* CAN: Controller Area Network */
#define ETH_P_CANFD 0x000D /* CANFD: CAN flexible data rate*/
#define ETH_P_PPPTALK 0x0010 /* Dummy type for Atalk over PPP*/
#define ETH_P_TR_802_2 0x0011 /* 802.2 frames */
#define ETH_P_MOBITEX 0x0015 /* Mobitex (kaz@cafe.net) */
#define ETH_P_CONTROL 0x0016 /* Card specific control frames */
#define ETH_P_IRDA 0x0017 /* Linux-IrDA */
#define ETH_P_ECONET 0x0018 /* Acorn Econet */
#define ETH_P_HDLC 0x0019 /* HDLC frames */
#define ETH_P_ARCNET 0x001A /* 1A for ArcNet :-) */
#define ETH_P_DSA 0x001B /* Distributed Switch Arch. */
#define ETH_P_TRAILER 0x001C /* Trailer switch tagging */
#define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */
#define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */
#define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */
#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */
#define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and
* aggregation protocol
*/
#define ETH_P_MCTP 0x00FA /* Management component transport
* protocol packets
*/
#endif

View File

@ -0,0 +1,105 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef __PKT_CLS_DEFS_H__
#define __PKT_CLS_DEFS_H__
#define TC_COOKIE_MAX_SIZE 16
/* Action attributes */
/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */
#define TCA_ACT_FLAGS_NO_PERCPU_STATS \
(1 << 0) /* Don't use percpu allocator for \
* actions stats. \
*/
#define TCA_ACT_FLAGS_SKIP_HW (1 << 1) /* don't offload action to HW */
#define TCA_ACT_FLAGS_SKIP_SW (1 << 2) /* don't use action in SW */
/* tca HW stats type
* When user does not pass the attribute, he does not care.
* It is the same as if he would pass the attribute with
* all supported bits set.
* In case no bits are set, user is not interested in getting any HW statistics.
*/
#define TCA_ACT_HW_STATS_IMMEDIATE \
(1 << 0) /* Means that in dump, user \
* gets the current HW stats \
* state from the device \
* queried at the dump time. \
*/
#define TCA_ACT_HW_STATS_DELAYED \
(1 << 1) /* Means that in dump, user gets \
* HW stats that might be out of date \
* for some time, maybe couple of \
* seconds. This is the case when \
* driver polls stats updates \
* periodically or when it gets async \
* stats update from the device. \
*/
#define TCA_ACT_MAX __TCA_ACT_MAX
#define TCA_OLD_COMPAT (TCA_ACT_MAX + 1)
#define TCA_ACT_MAX_PRIO 32
#define TCA_ACT_BIND 1
#define TCA_ACT_NOBIND 0
#define TCA_ACT_UNBIND 1
#define TCA_ACT_NOUNBIND 0
#define TCA_ACT_REPLACE 1
#define TCA_ACT_NOREPLACE 0
#define TC_ACT_UNSPEC (-1)
#define TC_ACT_OK 0
#define TC_ACT_RECLASSIFY 1
#define TC_ACT_SHOT 2
#define TC_ACT_PIPE 3
#define TC_ACT_STOLEN 4
#define TC_ACT_QUEUED 5
#define TC_ACT_REPEAT 6
#define TC_ACT_REDIRECT 7
#define TC_ACT_TRAP 8
/* For hw path, this means "trap to cpu"
* and don't further process the frame
* in hardware. For sw path, this is
* equivalent of TC_ACT_STOLEN - drop
* the skb and act like everything
* is alright.
*/
#define TC_ACT_VALUE_MAX TC_ACT_TRAP
/* There is a special kind of actions called "extended actions",
 * which need a value parameter. These have a local opcode located in
 * the highest nibble, starting from 1. The rest of the bits
 * are used to carry the value. These two parts together make
 * a combined opcode.
 */
/* Bit position of the local opcode (the highest nibble of a 32-bit value). */
#define __TC_ACT_EXT_SHIFT 28
/* Build the opcode part of a combined opcode from a local opcode. */
#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT)
/* Mask selecting the value bits below the opcode nibble. */
#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1)
/* Extract the opcode part of a combined opcode (value bits cleared). */
#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK))
/* Non-zero when the opcode part of `combined` equals `opcode`.
 * `opcode` is parenthesized so that an expression argument (for example
 * `a | b`) is evaluated as a whole before the comparison; `==` binds
 * tighter than the bitwise operators.
 */
#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == (opcode))
#define TC_ACT_JUMP __TC_ACT_EXT(1)
#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN
/* These macros are put here for binary compatibility with userspace apps that
* make use of them. For kernel code and new userspace apps, use the TCA_ID_*
* versions.
*/
#define TCA_ACT_GACT 5
#define TCA_ACT_IPT 6
#define TCA_ACT_PEDIT 7
#define TCA_ACT_MIRRED 8
#define TCA_ACT_NAT 9
#define TCA_ACT_XT 10
#define TCA_ACT_SKBEDIT 11
#define TCA_ACT_VLAN 12
#define TCA_ACT_BPF 13
#define TCA_ACT_CONNMARK 14
#define TCA_ACT_SKBMOD 15
#define TCA_ACT_CSUM 16
#define TCA_ACT_TUNNEL_KEY 17
#define TCA_ACT_SIMP 22
#define TCA_ACT_IFE 25
#define TCA_ACT_SAMPLE 26
#endif

View File

@ -0,0 +1,196 @@
/*
 * Socket-layer constants: address and protocol families (AF_* / PF_*),
 * SOMAXCONN, send/recv flags (MSG_*), setsockopt levels (SOL_*) and
 * IPX_TYPE.
 *
 * NOTE(review): presumably kept as a local header because the generated
 * vmlinux.h provides types but no preprocessor macros -- confirm.
 */
#ifndef __SOCKET_DEFS_H__
#define __SOCKET_DEFS_H__
/* Supported address families. */
#define AF_UNSPEC 0
#define AF_UNIX 1 /* Unix domain sockets */
#define AF_LOCAL 1 /* POSIX name for AF_UNIX */
#define AF_INET 2 /* Internet IP Protocol */
#define AF_AX25 3 /* Amateur Radio AX.25 */
#define AF_IPX 4 /* Novell IPX */
#define AF_APPLETALK 5 /* AppleTalk DDP */
#define AF_NETROM 6 /* Amateur Radio NET/ROM */
#define AF_BRIDGE 7 /* Multiprotocol bridge */
#define AF_ATMPVC 8 /* ATM PVCs */
#define AF_X25 9 /* Reserved for X.25 project */
#define AF_INET6 10 /* IP version 6 */
#define AF_ROSE 11 /* Amateur Radio X.25 PLP */
#define AF_DECnet 12 /* Reserved for DECnet project */
#define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/
#define AF_SECURITY 14 /* Security callback pseudo AF */
#define AF_KEY 15 /* PF_KEY key management API */
#define AF_NETLINK 16
#define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */
#define AF_PACKET 17 /* Packet family */
#define AF_ASH 18 /* Ash */
#define AF_ECONET 19 /* Acorn Econet */
#define AF_ATMSVC 20 /* ATM SVCs */
#define AF_RDS 21 /* RDS sockets */
#define AF_SNA 22 /* Linux SNA Project (nutters!) */
#define AF_IRDA 23 /* IRDA sockets */
#define AF_PPPOX 24 /* PPPoX sockets */
#define AF_WANPIPE 25 /* Wanpipe API Sockets */
#define AF_LLC 26 /* Linux LLC */
#define AF_IB 27 /* Native InfiniBand address */
#define AF_MPLS 28 /* MPLS */
#define AF_CAN 29 /* Controller Area Network */
#define AF_TIPC 30 /* TIPC sockets */
#define AF_BLUETOOTH 31 /* Bluetooth sockets */
#define AF_IUCV 32 /* IUCV sockets */
#define AF_RXRPC 33 /* RxRPC sockets */
#define AF_ISDN 34 /* mISDN sockets */
#define AF_PHONET 35 /* Phonet sockets */
#define AF_IEEE802154 36 /* IEEE802154 sockets */
#define AF_CAIF 37 /* CAIF sockets */
#define AF_ALG 38 /* Algorithm sockets */
#define AF_NFC 39 /* NFC sockets */
#define AF_VSOCK 40 /* vSockets */
#define AF_KCM 41 /* Kernel Connection Multiplexor*/
#define AF_QIPCRTR 42 /* Qualcomm IPC Router */
#define AF_SMC 43 /* smc sockets: reserve number for
* PF_SMC protocol family that
* reuses AF_INET address family
*/
#define AF_XDP 44 /* XDP sockets */
#define AF_MAX 45 /* For now.. */
/* Protocol families, same as address families. */
#define PF_UNSPEC AF_UNSPEC
#define PF_UNIX AF_UNIX
#define PF_LOCAL AF_LOCAL
#define PF_INET AF_INET
#define PF_AX25 AF_AX25
#define PF_IPX AF_IPX
#define PF_APPLETALK AF_APPLETALK
#define PF_NETROM AF_NETROM
#define PF_BRIDGE AF_BRIDGE
#define PF_ATMPVC AF_ATMPVC
#define PF_X25 AF_X25
#define PF_INET6 AF_INET6
#define PF_ROSE AF_ROSE
#define PF_DECnet AF_DECnet
#define PF_NETBEUI AF_NETBEUI
#define PF_SECURITY AF_SECURITY
#define PF_KEY AF_KEY
#define PF_NETLINK AF_NETLINK
#define PF_ROUTE AF_ROUTE
#define PF_PACKET AF_PACKET
#define PF_ASH AF_ASH
#define PF_ECONET AF_ECONET
#define PF_ATMSVC AF_ATMSVC
#define PF_RDS AF_RDS
#define PF_SNA AF_SNA
#define PF_IRDA AF_IRDA
#define PF_PPPOX AF_PPPOX
#define PF_WANPIPE AF_WANPIPE
#define PF_LLC AF_LLC
#define PF_IB AF_IB
#define PF_MPLS AF_MPLS
#define PF_CAN AF_CAN
#define PF_TIPC AF_TIPC
#define PF_BLUETOOTH AF_BLUETOOTH
#define PF_IUCV AF_IUCV
#define PF_RXRPC AF_RXRPC
#define PF_ISDN AF_ISDN
#define PF_PHONET AF_PHONET
#define PF_IEEE802154 AF_IEEE802154
#define PF_CAIF AF_CAIF
#define PF_ALG AF_ALG
#define PF_NFC AF_NFC
#define PF_VSOCK AF_VSOCK
#define PF_KCM AF_KCM
#define PF_QIPCRTR AF_QIPCRTR
#define PF_SMC AF_SMC
#define PF_XDP AF_XDP
#define PF_MAX AF_MAX
/* Maximum queue length specifiable by listen. */
#define SOMAXCONN 4096
/* Flags we can use with send/ and recv.
Added those for 1003.1g not all are supported yet
*/
#define MSG_OOB 1
#define MSG_PEEK 2
#define MSG_DONTROUTE 4
#define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */
#define MSG_CTRUNC 8
#define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. for MTU */
#define MSG_TRUNC 0x20
#define MSG_DONTWAIT 0x40 /* Nonblocking io */
#define MSG_EOR 0x80 /* End of record */
#define MSG_WAITALL 0x100 /* Wait for a full request */
#define MSG_FIN 0x200
#define MSG_SYN 0x400
#define MSG_CONFIRM 0x800 /* Confirm path validity */
#define MSG_RST 0x1000
#define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */
#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */
#define MSG_MORE 0x8000 /* Sender will send more */
#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */
#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */
#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
#define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */
#define MSG_EOF MSG_FIN
#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
#define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry
* plain text and require encryption
*/
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file
descriptor received through
SCM_RIGHTS */
#if defined(CONFIG_COMPAT)
#define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */
#else
#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */
#endif
/* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
#define SOL_IP 0
/* #define SOL_ICMP 1 No-no-no! Due to Linux :-) we cannot use SOL_ICMP=1 */
#define SOL_TCP 6
#define SOL_UDP 17
#define SOL_IPV6 41
#define SOL_ICMPV6 58
#define SOL_SCTP 132
#define SOL_UDPLITE 136 /* UDP-Lite (RFC 3828) */
#define SOL_RAW 255
#define SOL_IPX 256
#define SOL_AX25 257
#define SOL_ATALK 258
#define SOL_NETROM 259
#define SOL_ROSE 260
#define SOL_DECNET 261
#define SOL_X25 262
#define SOL_PACKET 263
#define SOL_ATM 264 /* ATM layer (cell level) */
#define SOL_AAL 265 /* ATM Adaption Layer (packet level) */
#define SOL_IRDA 266
#define SOL_NETBEUI 267
#define SOL_LLC 268
#define SOL_DCCP 269
#define SOL_NETLINK 270
#define SOL_TIPC 271
#define SOL_RXRPC 272
#define SOL_PPPOL2TP 273
#define SOL_BLUETOOTH 274
#define SOL_PNPIPE 275
#define SOL_RDS 276
#define SOL_IUCV 277
#define SOL_CAIF 278
#define SOL_ALG 279
#define SOL_NFC 280
#define SOL_KCM 281
#define SOL_TLS 282
#define SOL_XDP 283
/* IPX options */
#define IPX_TYPE 1
#endif

View File

@ -3,20 +3,18 @@
* SPDX-License-Identifier: AGPL-3.0-only
* Copyright (c) since 2022, v2rayA Organization <team@v2raya.org>
*/
#include <errno.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <stdbool.h>
#include "headers/if_ether_defs.h"
#include "headers/pkt_cls_defs.h"
#include "headers/socket_defs.h"
#include "headers/vmlinux.h" // Use "make ebpf" to generate.
#include <asm-generic/errno-base.h>
// #include <linux/bpf.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
// #define likely(x) x
// #define unlikely(x) x
@ -24,6 +22,7 @@
#define unlikely(x) __builtin_expect((x), 0)
#define IPV6_BYTE_LENGTH 16
#define TASK_COMM_LEN 16
#define IPV4_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check))
#define IPV4_DST_OFF (ETH_HLEN + offsetof(struct iphdr, daddr))
@ -41,8 +40,7 @@
//#define MAX_LPM_SIZE 20480
#define MAX_LPM_NUM (MAX_MATCH_SET_LEN + 8)
#define MAX_DST_MAPPING_NUM (65536 * 2)
#define MAX_SRC_PID_MAPPING_NUM \
65536 // It is enough because we use it just for routing.
#define MAX_SRC_PID_PNAME_MAPPING_NUM (65536)
#define IPV6_MAX_EXTENSIONS 4
#define OUTBOUND_DIRECT 0
@ -52,6 +50,11 @@
#define OUTBOUND_LOGICAL_AND 0xFF
#define OUTBOUND_LOGICAL_MASK 0xFE
/* Current network namespace.
 * Sentinel value (-1) defined locally in this file.
 * NOTE(review): presumably needed because <linux/bpf.h> is no longer
 * included; confirm the generated vmlinux.h does not already declare this
 * enumerator, which would be a redefinition.
 */
enum {
BPF_F_CURRENT_NETNS = (-1L),
};
enum {
DisableL4ChecksumPolicy_EnableL4Checksum,
DisableL4ChecksumPolicy_Restore,
@ -63,6 +66,7 @@ static const __u32 zero_key = 0;
static const __u32 tproxy_port_key = 1;
static const __u32 disable_l4_tx_checksum_key = 2;
static const __u32 disable_l4_rx_checksum_key = 3;
static const __u32 control_plane_pid_key = 4;
struct ip_port {
__be32 ip[4];
@ -197,10 +201,10 @@ enum L4ProtoType {
L4ProtoType_UDP = 2,
L4ProtoType_X = 3,
};
enum IpVersion {
IpVersion_4 = 1,
IpVersion_6 = 2,
IpVersion_X = 3,
enum IpVersionType {
IpVersionType_4 = 1,
IpVersionType_6 = 2,
IpVersionType_X = 3,
};
struct port_range {
__u16 port_start;
@ -223,7 +227,7 @@ struct match_set {
__u32 index;
struct port_range port_range;
enum L4ProtoType l4proto_type;
enum IpVersion ip_version;
enum IpVersionType ip_version;
};
enum MatchType type;
bool not ; // A subrule flag (this is not a match_set flag).
@ -255,11 +259,16 @@ struct ip_port_proto {
__u8 proto;
};
// pid_pname pairs a process ID with a fixed-size name buffer of
// TASK_COMM_LEN bytes. It is used as the value type of src_pid_map.
// NOTE(review): pname presumably holds the task's comm string -- confirm
// against the code that fills the map.
struct pid_pname {
__u32 pid;
char pname[TASK_COMM_LEN];
};
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, struct ip_port_proto);
__type(value, __u32); // pid
__uint(max_entries, MAX_SRC_PID_MAPPING_NUM);
__type(value, struct pid_pname);
__uint(max_entries, MAX_SRC_PID_PNAME_MAPPING_NUM);
/// NOTICE: No persistence.
// __uint(pinning, LIBBPF_PIN_BY_NAME);
} src_pid_map SEC(".maps");
@ -483,6 +492,10 @@ parse_transport(struct __sk_buff *skb, struct ethhdr *ethh, struct iphdr *iph,
// Skip ethhdr for next hdr.
offset += sizeof(struct ethhdr);
__builtin_memset(iph, 0, sizeof(struct iphdr));
__builtin_memset(ipv6h, 0, sizeof(struct ipv6hdr));
__builtin_memset(tcph, 0, sizeof(struct tcphdr));
__builtin_memset(udph, 0, sizeof(struct udphdr));
*ihl = 0;
*ipversion = 0;
*l4proto = 0;
@ -796,18 +809,20 @@ static __always_inline int decap_after_udp_hdr(struct __sk_buff *skb,
// Do not use __always_inline here because this function is too heavy.
static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4],
__be32 daddr[4], __be32 mac[4]) {
#define _l4proto flag[0]
#define _ipversion flag[1]
#define _hash flag[2]
#define _l4proto_type flag[0]
#define _ipversion_type flag[1]
#define _from_localhost flag[2]
int ret;
/// TODO: BPF_MAP_UPDATE_BATCH ?
__u32 key = MatchType_L4Proto;
int ret;
if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_l4proto,
if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_l4proto_type,
BPF_ANY))) {
return ret;
};
key = MatchType_IpVersion;
if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_ipversion,
if ((ret = bpf_map_update_elem(&l4proto_ipversion_map, &key, &_ipversion_type,
BPF_ANY))) {
return ret;
};
@ -815,13 +830,14 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4],
// Define variables for further use.
__u16 h_dport;
__u16 h_sport;
if (_l4proto == L4ProtoType_TCP) {
if (_l4proto_type == L4ProtoType_TCP) {
h_dport = bpf_ntohs(((struct tcphdr *)l4_hdr)->dest);
h_sport = bpf_ntohs(((struct tcphdr *)l4_hdr)->source);
} else {
h_dport = bpf_ntohs(((struct udphdr *)l4_hdr)->dest);
h_sport = bpf_ntohs(((struct udphdr *)l4_hdr)->source);
}
key = MatchType_SourcePort;
if ((ret = bpf_map_update_elem(&h_port_map, &key, &h_sport, BPF_ANY))) {
return ret;
@ -832,7 +848,7 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4],
};
// Modify DNS upstream for routing.
if (h_dport == 53 && _l4proto == L4ProtoType_UDP) {
if (h_dport == 53 && _l4proto_type == L4ProtoType_UDP) {
struct ip_port *upstream =
bpf_map_lookup_elem(&dns_upstream_map, &zero_key);
if (!upstream) {
@ -975,7 +991,7 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4],
// bpf_printk("MATCHED: match_set->type: %u, match_set->not: %d",
// match_set->type, match_set->not );
if (match_set->outbound == OUTBOUND_DIRECT && h_dport == 53 &&
_l4proto == L4ProtoType_UDP) {
_l4proto_type == L4ProtoType_UDP) {
// DNS packet should go through control plane.
return OUTBOUND_CONTROL_PLANE_DIRECT;
}
@ -987,8 +1003,9 @@ static int routing(__u32 flag[3], void *l4_hdr, __be32 saddr[4],
bpf_printk("No match_set hits. Did coder forget to sync "
"common/consts/ebpf.go with enum MatchType?");
return -EPERM;
#undef _l4proto
#undef _ip_version
#undef _l4proto_type
#undef _ipversion_type
#undef _from_localhost
}
// Do DNAT.
@ -1010,13 +1027,15 @@ int tproxy_ingress(struct __sk_buff *skb) {
bpf_printk("parse_transport: %d", ret);
return TC_ACT_OK;
}
bool is_ipv6 = ipversion == 6;
// Backup for further use.
__be16 ipv4_tot_len = 0;
// Parse saddr and daddr as ipv6 format.
__be32 saddr[4];
__be32 daddr[4];
if (ipversion == 4) {
if (!is_ipv6) {
saddr[0] = 0;
saddr[1] = 0;
saddr[2] = bpf_htonl(0x0000ffff);
@ -1028,16 +1047,14 @@ int tproxy_ingress(struct __sk_buff *skb) {
daddr[3] = iph.daddr;
ipv4_tot_len = iph.tot_len;
} else if (ipversion == 6) {
} else {
__builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH);
__builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH);
} else {
return TC_ACT_OK;
}
// If this packet is sent to this host, accept it.
__u32 tproxy_ip[4];
int to_host = ip_is_host(ipversion == 6, skb->ifindex, daddr, tproxy_ip);
int to_host = ip_is_host(is_ipv6, skb->ifindex, daddr, tproxy_ip);
if (to_host < 0) { // error
// bpf_printk("to_host: %ld", to_host);
return TC_ACT_OK;
@ -1069,12 +1086,12 @@ int tproxy_ingress(struct __sk_buff *skb) {
// New TCP connection.
// bpf_printk("[%X]New Connection", bpf_ntohl(tcph.seq));
__u32 flag[3] = {L4ProtoType_TCP}; // TCP
if (ipversion == 6) {
flag[1] = IpVersion_6;
if (is_ipv6) {
flag[1] = IpVersionType_6;
} else {
flag[1] = IpVersion_4;
flag[1] = IpVersionType_4;
}
flag[2] = skb->hash;
flag[2] = false;
__be32 mac[4] = {
0,
0,
@ -1090,8 +1107,8 @@ int tproxy_ingress(struct __sk_buff *skb) {
outbound = ret;
// Print only new connection.
bpf_printk("tcp: outbound: %u, %pI6:%u", outbound, daddr,
bpf_ntohs(key_src.port));
// bpf_printk("tcp(lan): outbound: %u, %pI6:%u", outbound, daddr,
// bpf_ntohs(key_src.port));
} else {
// bpf_printk("[%X]Old Connection", bpf_ntohl(tcph.seq));
// The TCP connection exists.
@ -1122,8 +1139,8 @@ int tproxy_ingress(struct __sk_buff *skb) {
__u32 *dst_ip = daddr;
__u16 dst_port = tcph.dest;
if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_TCP, ihl, dst_ip,
tproxy_ip, true))) {
if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_TCP, ihl, dst_ip, tproxy_ip,
true))) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
@ -1143,12 +1160,12 @@ int tproxy_ingress(struct __sk_buff *skb) {
// Routing. It decides if we redirect traffic to control plane.
__u32 flag[3] = {L4ProtoType_UDP};
if (ipversion == 6) {
flag[1] = IpVersion_6;
if (is_ipv6) {
flag[1] = IpVersionType_6;
} else {
flag[1] = IpVersion_4;
flag[1] = IpVersionType_4;
}
flag[2] = skb->hash;
flag[2] = false;
__be32 mac[4] = {
0,
0,
@ -1161,8 +1178,8 @@ int tproxy_ingress(struct __sk_buff *skb) {
return TC_ACT_SHOT;
}
new_hdr.outbound = ret;
bpf_printk("udp: outbound: %u, %pI6:%u", new_hdr.outbound, daddr,
bpf_ntohs(new_hdr.port));
// bpf_printk("udp(lan): outbound: %u, %pI6:%u", new_hdr.outbound, daddr,
// bpf_ntohs(new_hdr.port));
if (new_hdr.outbound == OUTBOUND_DIRECT) {
return TC_ACT_OK;
@ -1172,12 +1189,12 @@ int tproxy_ingress(struct __sk_buff *skb) {
// Rewrite to control plane.
// Encap a header to transmit fullcone tuple.
encap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &new_hdr,
encap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &new_hdr,
sizeof(new_hdr));
// Rewrite udp dst ip.
// bpf_printk("rewrite dst ip from %pI4", &ori_dst.ip);
if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_UDP, ihl, new_hdr.ip,
if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_UDP, ihl, new_hdr.ip,
tproxy_ip, true))) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
@ -1242,12 +1259,13 @@ int tproxy_egress(struct __sk_buff *skb) {
if (ret) {
return TC_ACT_OK;
}
bool is_ipv6 = ipversion == 6;
// Parse saddr and daddr as ipv6 format.
__be32 saddr[4];
__be32 daddr[4];
__be16 ipv4_tot_len = 0;
if (ipversion == 4) {
if (!is_ipv6) {
saddr[0] = 0;
saddr[1] = 0;
saddr[2] = bpf_htonl(0x0000ffff);
@ -1259,11 +1277,9 @@ int tproxy_egress(struct __sk_buff *skb) {
daddr[3] = iph.daddr;
ipv4_tot_len = iph.tot_len;
} else if (ipversion == 6) {
__builtin_memcpy(daddr, ipv6h.daddr.in6_u.u6_addr32, IPV6_BYTE_LENGTH);
__builtin_memcpy(saddr, ipv6h.saddr.in6_u.u6_addr32, IPV6_BYTE_LENGTH);
} else {
return TC_ACT_OK;
__builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH);
__builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH);
}
__be16 sport;
if (l4proto == IPPROTO_TCP) {
@ -1280,7 +1296,7 @@ int tproxy_egress(struct __sk_buff *skb) {
return TC_ACT_OK;
}
__be32 tproxy_ip[4];
ret = ip_is_host(ipversion == 6, skb->ifindex, saddr, tproxy_ip);
ret = ip_is_host(is_ipv6, skb->ifindex, saddr, tproxy_ip);
if (!(ret == 1) || !equal_ipv6_format(saddr, tproxy_ip)) {
return TC_ACT_OK;
}
@ -1308,8 +1324,8 @@ int tproxy_egress(struct __sk_buff *skb) {
__u32 *src_ip = saddr;
__u16 src_port = tcph.source;
if (rewrite_ip(skb, ipversion == 6, IPPROTO_TCP, ihl, src_ip,
original_dst->ip, false) < 0) {
if (rewrite_ip(skb, is_ipv6, IPPROTO_TCP, ihl, src_ip, original_dst->ip,
false) < 0) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
@ -1333,12 +1349,12 @@ int tproxy_egress(struct __sk_buff *skb) {
// Get source ip/port from our packet header.
// Decap header to get fullcone tuple.
decap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &ori_src,
decap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &ori_src,
sizeof(ori_src));
// Rewrite udp src ip
if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_UDP, ihl, src_ip,
ori_src.ip, false))) {
if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_UDP, ihl, src_ip, ori_src.ip,
false))) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
@ -1378,6 +1394,39 @@ int tproxy_egress(struct __sk_buff *skb) {
return TC_ACT_OK;
}
// Find the process (pid and name) that owns a source address in src_pid_map.
//
// Up to three look-ups are made, from the most to the least specific key:
//   1. the key exactly as given;
//   2. the same port/proto with the address cleared to the "unspecific"
//      address of the key's own family (for IPv4 keys only ip[3] is cleared,
//      so the upper words of the IPv4-in-IPv6 form are kept);
//   3. IPv4 only: the same port/proto with the all-zero (IPv6 unspecific)
//      address.
//
// @is_ipv6: whether src_key holds a native IPv6 address.
// @src_key: key to look up. NOTE: the address part (src_key->ip) is modified
//           by the fallback look-ups; port and proto are left untouched.
//
// Returns the map value on a hit, or NULL if none of the look-ups matched.
static __always_inline struct pid_pname *
lookup_src_pid_map(bool is_ipv6, struct ip_port_proto *src_key) {
  struct pid_pname *pid_pname;

  // First look-up: exact source address.
  if ((pid_pname = bpf_map_lookup_elem(&src_pid_map, src_key))) {
    return pid_pname;
  }

  // Second look-up: unspecific address of the same family.
  if (is_ipv6) {
    // Clear the address only. Zeroing the whole key would also wipe port and
    // proto, and such a key can never match an entry created for a socket.
    __builtin_memset(src_key->ip, 0, sizeof(src_key->ip));
  } else {
    src_key->ip[3] = 0;
  }
  if ((pid_pname = bpf_map_lookup_elem(&src_pid_map, src_key))) {
    return pid_pname;
  }
  if (is_ipv6) {
    return NULL;
  }

  // Third look-up for IPv4 packet.
  // Lookup IPv6 unspecific address.
  // https://github.com/torvalds/linux/blob/62fb9874f5da54fdb243003b386128037319b219/net/ipv4/af_inet.c#L475
  src_key->ip[2] = 0;
  return bpf_map_lookup_elem(&src_pid_map, src_key);
}
// Two fixed 6-byte MAC address values that differ only in the last byte.
// The leading 0x02 octet has the "locally administered" bit set.
// NOTE(review): judging by the names they presumably mark frames travelling
// to / from the tproxy; their use is outside this view — confirm.
__u8 special_mac_to_tproxy[6] = {2, 0, 2, 3, 0, 0};
__u8 special_mac_from_tproxy[6] = {2, 0, 2, 3, 0, 1};
@ -1389,11 +1438,9 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
if (skb->ingress_ifindex != NOWHERE_IFINDEX) {
return TC_ACT_OK;
}
if ((skb->mark & 0x80) == 0x80) {
return TC_ACT_OK;
} else if (skb->mark > 0) {
bpf_printk("mark", skb->mark);
}
// if ((skb->mark & 0x80) == 0x80) {
// return TC_ACT_OK;
// }
struct ethhdr ethh;
struct iphdr iph;
@ -1409,6 +1456,7 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
if (ret) {
return TC_ACT_OK;
}
bool is_ipv6 = ipversion == 6;
__be16 sport;
if (l4proto == IPPROTO_TCP) {
@ -1434,7 +1482,7 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
// Parse saddr and daddr as ipv6 format.
__be32 saddr[4];
__be32 daddr[4];
if (ipversion == 4) {
if (!is_ipv6) {
saddr[0] = 0;
saddr[1] = 0;
saddr[2] = bpf_htonl(0x0000ffff);
@ -1446,12 +1494,11 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
daddr[3] = iph.daddr;
ipv4_tot_len = iph.tot_len;
} else if (ipversion == 6) {
} else {
__builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH);
__builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH);
} else {
return TC_ACT_OK;
}
if (tproxy_response) {
// Packets from tproxy port.
// We need to redirect it to original port.
@ -1475,6 +1522,37 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
return TC_ACT_REDIRECT;
} else {
// Normal packets.
// Prepare key.
struct ip_port_proto src_key;
__builtin_memset(&src_key, 0, sizeof(struct ip_port_proto));
src_key.proto = l4proto;
__builtin_memcpy(src_key.ip, saddr, IPV6_BYTE_LENGTH);
src_key.port = sport;
struct pid_pname *pid_pname = lookup_src_pid_map(is_ipv6, &src_key);
if (pid_pname) {
// Get tproxy pid and compare if they are equal.
__u32 *pid_tproxy;
if (!(pid_tproxy =
bpf_map_lookup_elem(&param_map, &control_plane_pid_key))) {
bpf_printk("control_plane_pid is not set.");
return TC_ACT_SHOT;
}
if (pid_pname->pid == *pid_tproxy) {
// Control plane to direct.
// bpf_printk("Control plane to direct.");
return TC_ACT_OK;
}
} else {
if ((skb->mark & 0x80) == 0x80) {
bpf_printk("No pid_pname found. But it should not happen: %pI6:%u (%u)",
saddr, bpf_ntohs(sport), l4proto);
}
}
// Not from tproxy; from other processes.
if (l4proto == IPPROTO_TCP) {
// Backup for further use.
tcp_state_syn = tcph.syn && !tcph.ack;
@ -1489,12 +1567,12 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
// New TCP connection.
// bpf_printk("[%X]New Connection", bpf_ntohl(tcph.seq));
__u32 flag[3] = {L4ProtoType_TCP}; // TCP
if (ipversion == 6) {
flag[1] = IpVersion_6;
if (is_ipv6) {
flag[1] = IpVersionType_6;
} else {
flag[1] = IpVersion_4;
flag[1] = IpVersionType_4;
}
flag[2] = skb->hash;
flag[2] = true;
__be32 mac[4] = {
0,
0,
@ -1510,8 +1588,8 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
outbound = ret;
// Print only new connection.
bpf_printk("tcp: outbound: %u, %pI6:%u", outbound, daddr,
bpf_ntohs(key_src.port));
// bpf_printk("tcp(wan): outbound: %u, %pI6:%u", outbound, daddr,
// bpf_ntohs(key_src.port));
} else {
// bpf_printk("[%X]Old Connection", bpf_ntohl(tcph.seq));
// The TCP connection exists.
@ -1569,12 +1647,12 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
// Routing. It decides if we redirect traffic to control plane.
__u32 flag[3] = {L4ProtoType_UDP};
if (ipversion == 6) {
flag[1] = IpVersion_6;
if (is_ipv6) {
flag[1] = IpVersionType_6;
} else {
flag[1] = IpVersion_4;
flag[1] = IpVersionType_4;
}
flag[2] = skb->hash;
flag[2] = true;
__be32 mac[4] = {
0,
0,
@ -1587,8 +1665,8 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
return TC_ACT_SHOT;
}
new_hdr.outbound = ret;
bpf_printk("udp: outbound: %u, %pI6:%u", new_hdr.outbound, daddr,
bpf_ntohs(new_hdr.port));
// bpf_printk("udp(wan): outbound: %u, %pI6:%u", new_hdr.outbound, daddr,
// bpf_ntohs(new_hdr.port));
if (new_hdr.outbound == OUTBOUND_DIRECT) {
return TC_ACT_OK;
@ -1610,7 +1688,7 @@ int tproxy_wan_egress(struct __sk_buff *skb) {
};
// Encap a header to transmit fullcone tuple.
encap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &new_hdr,
encap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &new_hdr,
sizeof(new_hdr));
// Redirect from egress to ingress.
@ -1641,6 +1719,7 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
if (ret) {
return TC_ACT_OK;
}
bool is_ipv6 = ipversion == 6;
// bpf_printk("bpf_ntohs(*(__u16 *)&ethh.h_source[4]): %u",
// bpf_ntohs(*(__u16 *)&ethh.h_source[4]));
@ -1655,7 +1734,7 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
__be32 saddr[4];
__be32 daddr[4];
__be32 ipv4_tot_len = 0;
if (ipversion == 4) {
if (!is_ipv6) {
saddr[0] = 0;
saddr[1] = 0;
saddr[2] = bpf_htonl(0x0000ffff);
@ -1667,11 +1746,9 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
daddr[3] = iph.daddr;
ipv4_tot_len = iph.tot_len;
} else if (ipversion == 6) {
} else {
__builtin_memcpy(daddr, &ipv6h.daddr, IPV6_BYTE_LENGTH);
__builtin_memcpy(saddr, &ipv6h.saddr, IPV6_BYTE_LENGTH);
} else {
return TC_ACT_OK;
}
__be16 sport;
__be16 dport;
@ -1713,8 +1790,8 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
}
// Rewrite sip and sport.
if (rewrite_ip(skb, ipversion == 6, IPPROTO_TCP, ihl, saddr,
original_dst->ip, false) < 0) {
if (rewrite_ip(skb, is_ipv6, IPPROTO_TCP, ihl, saddr, original_dst->ip,
false) < 0) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
@ -1734,12 +1811,12 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
// Get source ip/port from our packet header.
// Decap header to get fullcone tuple.
decap_after_udp_hdr(skb, ipversion == 6, ihl, ipv4_tot_len, &ori_src,
decap_after_udp_hdr(skb, is_ipv6, ihl, ipv4_tot_len, &ori_src,
sizeof(ori_src));
// Rewrite udp src ip
if ((ret = rewrite_ip(skb, ipversion == 6, IPPROTO_UDP, ihl, saddr,
ori_src.ip, false))) {
if ((ret = rewrite_ip(skb, is_ipv6, IPPROTO_UDP, ihl, saddr, ori_src.ip,
false))) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
@ -1762,7 +1839,7 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
// }
}
// Rewrite dip.
if (rewrite_ip(skb, ipversion == 6, l4proto, ihl, daddr, saddr, true) < 0) {
if (rewrite_ip(skb, is_ipv6, l4proto, ihl, daddr, saddr, true) < 0) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
@ -1780,15 +1857,14 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
// bpf_printk("should send to: %pI6:%u", tproxy_ip,
// bpf_ntohs(*tproxy_port));
if ((ret = rewrite_ip(skb, ipversion == 6, l4proto, ihl, daddr, tproxy_ip,
true))) {
if ((ret =
rewrite_ip(skb, is_ipv6, l4proto, ihl, daddr, tproxy_ip, true))) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
// (1) Use daddr as saddr to pass NIC verification. Notice that we do not
// modify the <sport> so tproxy will send packet to it.
if ((ret = rewrite_ip(skb, ipversion == 6, l4proto, ihl, saddr, daddr,
false))) {
if ((ret = rewrite_ip(skb, is_ipv6, l4proto, ihl, saddr, daddr, false))) {
bpf_printk("Shot IP: %d", ret);
return TC_ACT_SHOT;
}
@ -1807,52 +1883,210 @@ int tproxy_wan_ingress(struct __sk_buff *skb) {
return TC_ACT_OK;
}
// SEC("socket/src_pid_mapper")
// int src_pid_mapper(struct __sk_buff *skb) {
// struct ethhdr ethh;
// struct iphdr iph;
// struct ipv6hdr ipv6h;
// struct tcphdr tcph;
// struct udphdr udph;
// __u8 ihl;
// __u8 ipversion;
// __u8 l4proto;
// int ret = parse_transport(skb, &ethh, &iph, &ipv6h, &tcph, &udph, &ihl,
// &ipversion, &l4proto);
// if (ret) {
// return 0;
// }
// Get sockfd bind addr.
//
// kprobe on sys_bind: reads the user-space sockaddr passed as the second
// syscall argument and records "bound address/port -> current pid" in
// src_pid_map. Always returns 0.
//
// NOTE(review): the value stored here is a bare __u32 pid, while
// update_map_elem_by_sk() stores a struct pid_pname into the same map —
// confirm which value layout src_pid_map expects.
// NOTE(review): src_key.proto is never set in this function (stays 0 after
// the memset) — confirm that look-ups with a non-zero proto are not expected
// to match these entries.
SEC("kprobe/sys_bind")
int src_pid_mapper(struct pt_regs *ctx) {
// Second syscall argument: user-space pointer to the address being bound.
struct sockaddr_in *in = (struct sockaddr_in *)PT_REGS_PARM2(ctx);
struct sockaddr_in6 *in6 = NULL;
__kernel_sa_family_t family = 0;
// struct ip_port_proto src_key;
// __builtin_memset(&src_key, 0, sizeof(src_key));
// if (ipversion == 4) {
// src_key.ip[0] = 0;
// src_key.ip[1] = 0;
// src_key.ip[2] = bpf_htonl(0x0000ffff);
// src_key.ip[3] = iph.saddr;
// Read the address family first to learn how to interpret the sockaddr.
int ret = bpf_core_read_user(&family, sizeof(family), &in->sin_family);
if (ret) {
if (ret == -EFAULT) {
// NOTE(review): `ret` is passed but the format string has no conversion
// for it.
bpf_printk("sys_bind: Failed to read data from memory. Maybe data is in "
"swap space.",
ret);
} else {
bpf_printk("sys_bind: %d", ret);
}
return 0;
}
// } else if (ipversion == 6) {
// __builtin_memcpy(src_key.ip, &ipv6h.saddr, IPV6_BYTE_LENGTH);
// } else {
// return 0;
// }
// if (l4proto == IPPROTO_TCP) {
// src_key.port = tcph.source;
// src_key.proto = IPPROTO_TCP;
// } else if (l4proto == IPPROTO_UDP) {
// src_key.port = udph.source;
// src_key.proto = IPPROTO_UDP;
// } else {
// return 0;
// }
// __u32 pid = bpf_get_current_pid_tgid() >> 32;
// if ((ret = bpf_map_update_elem(&src_pid_map, &src_key, &pid, BPF_ANY))) {
// bpf_printk("socket_pid_mapper: failed update map: %d", ret);
// return 0;
// }
// bpf_printk("socket_pid_mapper: %pI6:%u -> %u", src_key.ip,
// bpf_ntohs(src_key.port), pid);
// return 0;
// }
// Build the map key from the bound address.
// NOTE(review): the return values of the reads below are not checked; on
// failure the corresponding key fields keep the zeros set by the memset.
struct ip_port_proto src_key;
__builtin_memset(&src_key, 0, sizeof(src_key));
if (family == AF_INET6) {
in6 = (struct sockaddr_in6 *)in;
in = NULL;
bpf_core_read_user(src_key.ip, sizeof(src_key.ip), &in6->sin6_addr);
bpf_core_read_user(&src_key.port, sizeof(src_key.port), &in6->sin6_port);
} else if (family == AF_INET) {
// IPv4 is stored in IPv4-in-IPv6 form: ip[2] = 0x0000ffff, ip[3] = address.
bpf_core_read_user(&src_key.ip[3], sizeof(src_key.ip[3]), &in->sin_addr);
src_key.ip[2] = bpf_htonl(0x0000ffff);
bpf_core_read_user(&src_key.port, sizeof(src_key.port), &in->sin_port);
} else {
// Any other address family is only logged and then ignored.
bpf_printk("family: %d", family);
return 0;
}
// The upper 32 bits of bpf_get_current_pid_tgid() are used as the pid.
__u32 pid = bpf_get_current_pid_tgid() >> 32;
if ((ret = bpf_map_update_elem(&src_pid_map, &src_key, &pid, BPF_ANY))) {
bpf_printk("socket_pid_mapper: failed update map: %d", ret);
return 0;
}
bpf_printk("socket_pid_mapper: %pI6:%u -> %u", src_key.ip,
bpf_ntohs(src_key.port), pid);
return 0;
}
// Build the src_pid_map key (source ip, source port, L4 protocol) describing
// the local end of socket `sk`.
//
// @sk:      socket to describe.
// @src_key: output; it is zeroed first and then filled in. On error it may
//           be left partially filled and must not be used.
//
// The protocol is derived from the socket type: SOCK_STREAM -> TCP,
// SOCK_DGRAM -> UDP, and for SOCK_RAW from sk_protocol (TCP or UDP only).
// IPv4 addresses are stored in IPv4-in-IPv6 form (ip[2] = 0x0000ffff,
// ip[3] = address); IPv6 addresses are copied as they are.
//
// Returns 0 on success, or -ERANGE if the socket type, raw protocol or
// address family is not one of those listed above.
static int __always_inline build_key_by_sk(struct sock *sk,
                                           struct ip_port_proto *src_key) {
  // Build key.
  __builtin_memset(src_key, 0, sizeof(struct ip_port_proto));

  // L4 protocol.
  __u16 sk_type = BPF_CORE_READ(sk, sk_type);
  if (sk_type == SOCK_STREAM) {
    src_key->proto = IPPROTO_TCP;
    // bpf_printk("TCP bind");
  } else if (sk_type == SOCK_DGRAM) {
    src_key->proto = IPPROTO_UDP;
    // bpf_printk("UDP bind");
  } else if (sk_type == SOCK_RAW) {
    __u16 sk_proto = BPF_CORE_READ(sk, sk_protocol);
    if (sk_proto == IPPROTO_TCP) {
      src_key->proto = IPPROTO_TCP;
      // bpf_printk("RAW TCP bind");
    } else if (sk_proto == IPPROTO_UDP) {
      // This branch must test for UDP; testing for TCP a second time made it
      // unreachable and raw UDP sockets were rejected with -ERANGE.
      src_key->proto = IPPROTO_UDP;
      // bpf_printk("RAW UDP bind");
    } else {
      return -ERANGE;
    }
  } else {
    return -ERANGE;
  }

  // Source address.
  struct inet_sock *inet = (struct inet_sock *)sk;
  unsigned short family = BPF_CORE_READ(sk, __sk_common.skc_family);
  if (family == AF_INET) {
    src_key->ip[2] = bpf_htonl(0x0000ffff);
    src_key->ip[3] = BPF_CORE_READ(inet, inet_saddr);
  } else if (family == AF_INET6) {
    BPF_CORE_READ_INTO(&src_key->ip, inet, pinet6, saddr.in6_u.u6_addr32);
  } else {
    if (family == AF_UNSPEC) {
      bpf_printk("oh shit AF_UNSPEC");
    }
    return -ERANGE;
  }

  // Source port, stored exactly as read from inet_sport.
  // NOTE(review): presumably network byte order, matching the __be16 source
  // port the TC programs put into their look-up keys — confirm.
  src_key->port = BPF_CORE_READ(inet, inet_sport);
  return 0;
}
// Record the current task as the owner of socket `sk` in src_pid_map.
//
// The key is produced by build_key_by_sk(); the value holds the upper 32 bits
// of bpf_get_current_pid_tgid() as pid and the name reported by
// bpf_get_current_comm().
//
// Returns 0 on success, otherwise the first non-zero code returned by
// build_key_by_sk(), bpf_get_current_comm() or bpf_map_update_elem().
static int __always_inline update_map_elem_by_sk(struct sock *sk) {
  // Key: source ip / port / proto of the socket.
  struct ip_port_proto key;
  int err = build_key_by_sk(sk, &key);
  if (err) {
    return err;
  }

  // Value: pid and process name of the calling task.
  struct pid_pname owner;
  __builtin_memset(&owner, 0, sizeof(struct pid_pname));
  owner.pid = bpf_get_current_pid_tgid() >> 32;
  err = bpf_get_current_comm(owner.pname, sizeof(owner.pname));
  if (err) {
    return err;
  }

  // Insert or overwrite the entry.
  /// TODO: We can use BPF_NOEXIST here to improve the performance.
  ///   But will the socket be released after processes dead abnormally?
  err = bpf_map_update_elem(&src_pid_map, &key, &owner, BPF_ANY);
  // if (err) bpf_printk("setup_mapping_from_sk: failed update map: %d", err);
  // else {
  //   bpf_printk("setup_mapping_from_sk: %pI6:%u (%d)", key.ip,
  //              bpf_ntohs(key.port), key.proto);
  //   bpf_printk("setup_mapping_from_sk: -> %s (%d)", owner.pname, owner.pid);
  // }
  return err;
}
// Remove the (sip, sport, proto) -> (pid, pname) mapping of a socket that is
// being released.
// kernel 5.5+
//
// The kernel function is `int inet_release(struct socket *sock)`: its
// argument is a `struct socket`, not a `struct sock`. It also detaches the
// sock (`sock->sk = NULL`) before it returns, so the sock is only reachable
// on entry. Therefore this program is attached with fentry and takes the sock
// from `sock->sk`.
//
// Always returns 0. Sockets for which no key can be built, and keys that are
// not present in src_pid_map, are silently ignored.
SEC("fentry/inet_release")
int BPF_PROG(inet_release, struct socket *sock) {
  struct sock *sk = sock->sk;
  if (!sk) {
    // Nothing is attached to this socket.
    return 0;
  }

  // Build key.
  struct ip_port_proto src_key;
  if (build_key_by_sk(sk, &src_key)) {
    return 0;
  }

  // Best effort: the entry may legitimately be absent, so the result of the
  // deletion is intentionally not checked.
  bpf_map_delete_elem(&src_pid_map, &src_key);
  return 0;
}
// Maintain the sip, sport -> pid, pname mapping on the send path.
// kernel 5.5+
// IPv4/IPv6 TCP/UDP send.
//
// Runs at the exit of inet_send_prepare(); the mapping is refreshed only when
// `ret` is zero. The result of the refresh is ignored and the program always
// returns 0.
SEC("fexit/inet_send_prepare")
int BPF_PROG(inet_send_prepare, struct sock *sk, int ret) {
  /// TODO: inet_release
  if (!unlikely(ret)) {
    update_map_elem_by_sk(sk);
  }
  return 0;
}
// Maintain the sip, sport -> pid, pname mapping when a socket is bound.
// kernel 5.5+
// IPv4 TCP/UDP listen.
//
// Runs at the exit of inet_bind(); the mapping for `sock->sk` is recorded
// only when `ret` is zero. The result of the update is ignored and the
// program always returns 0.
SEC("fexit/inet_bind")
int BPF_PROG(inet_bind, struct socket *sock, struct sockaddr *uaddr,
             int addr_len, int ret) {
  /// TODO: inet_release
  if (ret == 0) {
    update_map_elem_by_sk(sock->sk);
  }
  return 0;
}
// Maintain the sip, sport -> pid, pname mapping for outgoing TCP connections.
// kernel 5.5+
// IPv4 TCP connect.
// Attached with fentry (not fexit) because tcp_connect() will "Build a SYN
// and send it off".
// https://github.com/torvalds/linux/blob/62fb9874f5da54fdb243003b386128037319b219/net/ipv4/tcp_output.c#L3820
//
// The result of the update is ignored and the program always returns 0.
SEC("fentry/tcp_connect")
int BPF_PROG(tcp_connect, struct sock *sk) {
  /// TODO: inet4_release
  (void)update_map_elem_by_sk(sk);
  return 0;
}
// Maintain the sip, sport -> pid, pname mapping for automatically bound
// sockets.
// kernel 5.5+
// IPv4 UDP sendto/sendmsg.
//
// Runs at the exit of inet_autobind(); the mapping is recorded only when
// `ret` is zero. The result of the update is ignored and the program always
// returns 0.
SEC("fexit/inet_autobind")
int BPF_PROG(inet_autobind, struct sock *sk, int ret) {
  /// TODO: inet4_release
  if (ret == 0) {
    update_map_elem_by_sk(sk);
  }
  return 0;
}
// Maintain the sip, sport -> pid, pname mapping when an IPv6 socket is bound.
// kernel 5.5+
// IPv6 TCP/UDP listen.
//
// Runs at the exit of inet6_bind(); the mapping for `sock->sk` is recorded
// only when `ret` is zero. The result of the update is ignored and the
// program always returns 0.
SEC("fexit/inet6_bind")
int BPF_PROG(inet6_bind, struct socket *sock, struct sockaddr *uaddr,
             int addr_len, int ret) {
  /// TODO: inet6_release
  if (ret == 0) {
    update_map_elem_by_sk(sock->sk);
  }
  return 0;
}
// License string placed in the "license" ELF section of the object file.
SEC("license") const char __license[] = "Dual BSD/GPL";

View File

@ -6,7 +6,6 @@
package control
import (
"errors"
"fmt"
"github.com/mzz2017/softwind/pkg/zeroalloc/io"
"github.com/v2rayA/dae/common"
@ -14,6 +13,7 @@ import (
internal "github.com/v2rayA/dae/pkg/ebpf_internal"
"net"
"net/netip"
"strings"
"time"
)
@ -57,12 +57,14 @@ func (c *ControlPlane) handleConn(lConn net.Conn) (err error) {
}
defer rConn.Close()
if err = RelayTCP(lConn, rConn); err != nil {
var netErr net.Error
if errors.As(err, &netErr) && netErr.Timeout() {
return nil // ignore i/o timeout
}
switch {
case strings.HasSuffix(err.Error(), "write: broken pipe"),
strings.HasSuffix(err.Error(), "i/o timeout"):
return nil // ignore
default:
return fmt.Errorf("handleTCP relay error: %w", err)
}
}
return nil
}

View File

@ -10,7 +10,7 @@ set -ex
sudo rm -rf /sys/fs/bpf/tc/globals/*
# clang -fno-stack-protector -O2 -g -emit-llvm -c component/control/kern/tproxy.c -o - | llc -march=bpf -mcpu=v3 -mattr=+alu32 -filetype=obj -o foo.o
clang -O2 -g -Wall -Werror -c component/control/kern/tproxy.c -target bpf -o foo.o
clang -O2 -g -Wall -Werror -c component/control/kern/tproxy.c -target bpf -D__TARGET_ARCH_x86 -o foo.o
sudo tc filter del dev $lan ingress
sudo tc filter del dev $lan egress
sudo tc filter del dev $wan ingress