// SPDX-License-Identifier: AGPL-3.0-only // Copyright (c) 2022-2024, daeuniverse Organization // +build ignore #include "headers/errno-base.h" #include "headers/if_ether_defs.h" #include "headers/pkt_cls_defs.h" #include "headers/socket_defs.h" #include "headers/upai_in6_defs.h" #include "headers/vmlinux.h" #include "headers/bpf_core_read.h" #include "headers/bpf_endian.h" #include "headers/bpf_helpers.h" #include "headers/bpf_timer.h" // #define __DEBUG_ROUTING // #define __PRINT_ROUTING_RESULT // #define __PRINT_SETUP_PROCESS_CONNNECTION // #define __DEBUG // #define __UNROLL_ROUTE_LOOP #ifndef __DEBUG #undef bpf_printk #define bpf_printk(...) ((void)0) #endif // #define likely(x) x // #define unlikely(x) x #define likely(x) __builtin_expect((x), 1) #define unlikely(x) __builtin_expect((x), 0) #define IPV6_BYTE_LENGTH 16 #define TASK_COMM_LEN 16 #define IPV4_CSUM_OFF(link_h_len) ((link_h_len) + offsetof(struct iphdr, check)) #define IPV4_DST_OFF(link_h_len) ((link_h_len) + offsetof(struct iphdr, daddr)) #define IPV4_SRC_OFF(link_h_len) ((link_h_len) + offsetof(struct iphdr, saddr)) #define IPV6_DST_OFF(link_h_len) \ ((link_h_len) + offsetof(struct ipv6hdr, daddr)) #define IPV6_SRC_OFF(link_h_len) \ ((link_h_len) + offsetof(struct ipv6hdr, saddr)) #define PACKET_HOST 0 #define PACKET_OTHERHOST 3 #define NOWHERE_IFINDEX 0 #define LOOPBACK_IFINDEX 1 #define MAX_PARAM_LEN 16 #define MAX_INTERFACE_NUM 256 #ifndef MAX_MATCH_SET_LEN #define MAX_MATCH_SET_LEN \ (32 * 32) // Should be sync with common/consts/ebpf.go. #endif #define MAX_LPM_SIZE 2048000 #define MAX_LPM_NUM (MAX_MATCH_SET_LEN + 8) #define MAX_DST_MAPPING_NUM (65536 * 2) #define MAX_TGID_PNAME_MAPPING_NUM (8192) #define MAX_COOKIE_PID_PNAME_MAPPING_NUM (65536) #define MAX_DOMAIN_ROUTING_NUM 65536 #define MAX_ARG_LEN 128 #define IPV6_MAX_EXTENSIONS 4 #define OUTBOUND_DIRECT 0 #define OUTBOUND_BLOCK 1 #define OUTBOUND_MUST_RULES 0xFC #define OUTBOUND_CONTROL_PLANE_ROUTING 0xFD #define OUTBOUND_LOGICAL_OR 0xFE #define OUTBOUND_LOGICAL_AND 0xFF #define OUTBOUND_LOGICAL_MASK 0xFE #define IS_WAN 0 #define IS_LAN 1 #define TPROXY_MARK 0x8000000 #define RECOGNIZE 0x2017 #define ESOCKTNOSUPPORT 94 /* Socket type not supported */ #define TIMEOUT_UDP_CONN_STATE 3e11 /* 300s */ #define NDP_REDIRECT 137 enum { BPF_F_CURRENT_NETNS = -1 }; enum { DisableL4ChecksumPolicy_EnableL4Checksum, DisableL4ChecksumPolicy_Restore, DisableL4ChecksumPolicy_SetZero, }; // Param keys: static const __u32 zero_key; static const __u32 one_key = 1; // Outbound Connectivity Map: struct outbound_connectivity_query { __u8 outbound; __u8 l4proto; __u8 ipversion; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, struct outbound_connectivity_query); __type(value, __u32); // true, false __uint(max_entries, 256 * 2 * 2); // outbound * l4proto * ipversion } outbound_connectivity_map SEC(".maps"); // Sockmap: struct { __uint(type, BPF_MAP_TYPE_SOCKMAP); __type(key, __u32); // 0 is tcp, 1 is udp. __type(value, __u64); // fd of socket. 
__uint(max_entries, 2); } listen_socket_map SEC(".maps"); union ip6 { __u8 u6_addr8[16]; __be16 u6_addr16[8]; __be32 u6_addr32[4]; __be64 u6_addr64[2]; }; struct redirect_tuple { union ip6 sip; union ip6 dip; }; struct redirect_entry { __u32 ifindex; __u8 smac[6]; __u8 dmac[6]; __u8 from_wan; }; struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __type(key, struct redirect_tuple); __type(value, struct redirect_entry); __uint(max_entries, 65536); } redirect_track SEC(".maps"); struct ip_port { union ip6 ip; __be16 port; }; struct routing_result { __u32 mark; __u8 must; __u8 mac[6]; __u8 outbound; __u8 pname[TASK_COMM_LEN]; __u32 pid; __u8 dscp; }; struct tuples_key { union ip6 sip; union ip6 dip; __u16 sport; __u16 dport; __u8 l4proto; }; struct tuples { struct tuples_key five; __u8 dscp; }; struct dae_param { __u32 tproxy_port; __u32 control_plane_pid; __u32 dae0_ifindex; __u32 dae_netns_id; __u8 dae0peer_mac[6]; __u8 padding[2]; }; static volatile const struct dae_param PARAM = {}; struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __type(key, __u32); // tgid __type(value, __u32[TASK_COMM_LEN / 4]); // process name. __uint(max_entries, MAX_TGID_PNAME_MAPPING_NUM); __uint(pinning, LIBBPF_PIN_BY_NAME); } tgid_pname_map SEC(".maps"); // This map is only for old method (redirect mode in WAN). struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __type(key, struct tuples_key); __type(value, struct routing_result); // outbound __uint(max_entries, MAX_DST_MAPPING_NUM); /// NOTICE: It MUST be pinned. __uint(pinning, LIBBPF_PIN_BY_NAME); } routing_tuples_map SEC(".maps"); /* Sockets in fast_sock map are used for fast-redirecting via * sk_msg/fast_redirect. Sockets are automactically deleted from map once * closed, so we don't need to worry about stale entries. */ struct { __uint(type, BPF_MAP_TYPE_SOCKHASH); __type(key, struct tuples_key); __type(value, __u64); __uint(max_entries, 65535); } fast_sock SEC(".maps"); // Link to type: #define LinkType_None 0 #define LinkType_Ethernet 1 struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, __u32); // ifindex __type(value, __u32); // link length __uint(max_entries, MAX_INTERFACE_NUM); /// NOTICE: No persistence. // __uint(pinning, LIBBPF_PIN_BY_NAME); } linklen_map SEC(".maps"); // Interface Ips: struct if_params { bool rx_cksm_offload; bool tx_l4_cksm_ip4_offload; bool tx_l4_cksm_ip6_offload; bool use_nonstandard_offload_algorithm; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __type(key, __u32); // ifindex __type(value, struct if_params); // ip __uint(max_entries, MAX_INTERFACE_NUM); /// NOTICE: No persistence. // __uint(pinning, LIBBPF_PIN_BY_NAME); } ifindex_params_map SEC(".maps"); // Array of LPM tries: struct lpm_key { struct bpf_lpm_trie_key trie_key; __be32 data[4]; }; struct map_lpm_type { __uint(type, BPF_MAP_TYPE_LPM_TRIE); __uint(map_flags, BPF_F_NO_PREALLOC); __uint(max_entries, MAX_LPM_SIZE); __uint(key_size, sizeof(struct lpm_key)); __uint(value_size, sizeof(__u32)); } unused_lpm_type SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); __uint(key_size, sizeof(__u32)); __uint(max_entries, MAX_LPM_NUM); // __uint(pinning, LIBBPF_PIN_BY_NAME); __array(values, struct map_lpm_type); } lpm_array_map SEC(".maps"); enum __attribute__((packed)) MatchType { /// WARNING: MUST SYNC WITH common/consts/ebpf.go. 
MatchType_DomainSet, MatchType_IpSet, MatchType_SourceIpSet, MatchType_Port, MatchType_SourcePort, MatchType_L4Proto, MatchType_IpVersion, MatchType_Mac, MatchType_ProcessName, MatchType_Dscp, MatchType_Fallback, }; enum L4ProtoType { L4ProtoType_TCP = 1, L4ProtoType_UDP, L4ProtoType_X, }; enum IpVersionType { IpVersionType_4 = 1, IpVersionType_6, IpVersionType_X, }; struct port_range { __u16 port_start; __u16 port_end; }; /* * Rule is like as following: * * domain(geosite:cn, suffix: google.com) && l4proto(tcp) -> my_group * * pseudocode: domain(geosite:cn || suffix:google.com) && l4proto(tcp) -> * my_group * * A match_set can be: IP set geosite:cn, suffix google.com, tcp proto */ struct match_set { union { __u8 __value[16]; // Placeholder for bpf2go. __u32 index; struct port_range port_range; enum L4ProtoType l4proto_type; enum IpVersionType ip_version; __u32 pname[TASK_COMM_LEN / 4]; __u8 dscp; }; bool not ; // A subrule flag (this is not a match_set flag). enum MatchType type; __u8 outbound; // User-defined value range is [0, 252]. bool must; __u32 mark; }; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, __u32); __type(value, struct match_set); __uint(max_entries, MAX_MATCH_SET_LEN); // __uint(pinning, LIBBPF_PIN_BY_NAME); } routing_map SEC(".maps"); struct domain_routing { __u32 bitmap[MAX_MATCH_SET_LEN / 32]; }; struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __type(key, __be32[4]); __type(value, struct domain_routing); __uint(max_entries, MAX_DOMAIN_ROUTING_NUM); /// NOTICE: No persistence. // __uint(pinning, LIBBPF_PIN_BY_NAME); } domain_routing_map SEC(".maps"); struct ip_port_proto { __u32 ip[4]; __be16 port; __u8 proto; }; struct pid_pname { __u32 pid; char pname[TASK_COMM_LEN]; }; struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __type(key, __u64); __type(value, struct pid_pname); __uint(max_entries, MAX_COOKIE_PID_PNAME_MAPPING_NUM); /// NOTICE: No persistence. 
	__uint(pinning, LIBBPF_PIN_BY_NAME);
} cookie_pid_map SEC(".maps");

struct udp_conn_state {
	bool is_egress;
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, MAX_DST_MAPPING_NUM);
	__type(key, struct tuples_key);
	__type(value, struct udp_conn_state);
} udp_conn_state_map SEC(".maps");

// Functions:

static __always_inline __u8 ipv4_get_dscp(const struct iphdr *iph)
{
	return (iph->tos & 0xfc) >> 2;
}

static __always_inline __u8 ipv6_get_dscp(const struct ipv6hdr *ipv6h)
{
	return (ipv6h->priority << 2) | (ipv6h->flow_lbl[0] >> 6);
}

static __always_inline void
get_tuples(const struct __sk_buff *skb, struct tuples *tuples,
	   const struct iphdr *iph, const struct ipv6hdr *ipv6h,
	   const struct tcphdr *tcph, const struct udphdr *udph, __u8 l4proto)
{
	__builtin_memset(tuples, 0, sizeof(*tuples));
	tuples->five.l4proto = l4proto;

	if (skb->protocol == bpf_htons(ETH_P_IP)) {
		tuples->five.sip.u6_addr32[2] = bpf_htonl(0x0000ffff);
		tuples->five.sip.u6_addr32[3] = iph->saddr;

		tuples->five.dip.u6_addr32[2] = bpf_htonl(0x0000ffff);
		tuples->five.dip.u6_addr32[3] = iph->daddr;

		tuples->dscp = ipv4_get_dscp(iph);
	} else {
		__builtin_memcpy(&tuples->five.dip, &ipv6h->daddr,
				 IPV6_BYTE_LENGTH);
		__builtin_memcpy(&tuples->five.sip, &ipv6h->saddr,
				 IPV6_BYTE_LENGTH);

		tuples->dscp = ipv6_get_dscp(ipv6h);
	}
	if (l4proto == IPPROTO_TCP) {
		tuples->five.sport = tcph->source;
		tuples->five.dport = tcph->dest;
	} else {
		tuples->five.sport = udph->source;
		tuples->five.dport = udph->dest;
	}
}

static __always_inline bool equal16(const __be32 x[4], const __be32 y[4])
{
#if __clang_major__ >= 10
	return ((__be64 *)x)[0] == ((__be64 *)y)[0] &&
	       ((__be64 *)x)[1] == ((__be64 *)y)[1];

	// return x[0] == y[0] && x[1] == y[1] && x[2] == y[2] && x[3] == y[3];
#else
	return __builtin_bcmp(x, y, IPV6_BYTE_LENGTH) == 0;
#endif
}

static __always_inline int
handle_ipv6_extensions(const struct __sk_buff *skb, __u32 offset, __u32 hdr,
		       struct icmp6hdr *icmp6h, struct tcphdr *tcph,
		       struct udphdr *udph, __u8 *ihl, __u8 *l4proto)
{
	__u8 hdr_length = 0;
	__u8 nexthdr = 0;
	*ihl = sizeof(struct ipv6hdr) / 4;
	int ret;
	// We only process TCP and UDP traffic.

	// Unrolling gives fewer instructions but higher memory consumption
	// when loading. We disable it here to support devices with less
	// memory.
	// #pragma unroll
	for (int i = 0; i < IPV6_MAX_EXTENSIONS;
	     i++, offset += hdr_length, hdr = nexthdr, *ihl += hdr_length / 4) {
		if (hdr_length % 4) {
			bpf_printk(
				"IPv6 extension length is not a multiple of 4");
			return 1;
		}
		// See control/control_plane.go.

		switch (hdr) {
		case IPPROTO_ICMPV6:
			*l4proto = hdr;
			hdr_length = sizeof(struct icmp6hdr);
			// Treat ICMPv6 as a layer-4 protocol.
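			// Note: ICMPv6 is parsed here even though only
			// TCP/UDP is proxied, so that the TC programs below
			// can react to it (e.g. tproxy_lan_egress() drops NDP
			// REDIRECT messages, and the ingress hooks simply
			// accept ICMPv6).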
ret = bpf_skb_load_bytes(skb, offset, icmp6h, hdr_length); if (ret) { bpf_printk("not a valid IPv6 packet"); return -EFAULT; } return 0; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: ret = bpf_skb_load_bytes(skb, offset + 1, &hdr_length, sizeof(hdr_length)); if (ret) { bpf_printk("not a valid IPv6 packet"); return -EFAULT; } special_n1: ret = bpf_skb_load_bytes(skb, offset, &nexthdr, sizeof(nexthdr)); if (ret) { bpf_printk("not a valid IPv6 packet"); return -EFAULT; } break; case IPPROTO_FRAGMENT: hdr_length = 4; goto special_n1; case IPPROTO_TCP: case IPPROTO_UDP: *l4proto = hdr; if (hdr == IPPROTO_TCP) { // Upper layer; ret = bpf_skb_load_bytes(skb, offset, tcph, sizeof(struct tcphdr)); if (ret) { bpf_printk("not a valid IPv6 packet"); return -EFAULT; } } else if (hdr == IPPROTO_UDP) { // Upper layer; ret = bpf_skb_load_bytes(skb, offset, udph, sizeof(struct udphdr)); if (ret) { bpf_printk("not a valid IPv6 packet"); return -EFAULT; } } else { // Unknown hdr. bpf_printk("Unexpected hdr."); return 1; } return 0; default: /// EXPECTED: Maybe ICMP, etc. // bpf_printk("IPv6 but unrecognized extension protocol: %u", hdr); return 1; } } bpf_printk("exceeds IPV6_MAX_EXTENSIONS limit"); return 1; } static __always_inline int parse_transport(const struct __sk_buff *skb, __u32 link_h_len, struct ethhdr *ethh, struct iphdr *iph, struct ipv6hdr *ipv6h, struct icmp6hdr *icmp6h, struct tcphdr *tcph, struct udphdr *udph, __u8 *ihl, __u8 *l4proto) { __u32 offset = 0; int ret; if (link_h_len == ETH_HLEN) { ret = bpf_skb_load_bytes(skb, offset, ethh, sizeof(struct ethhdr)); if (ret) { bpf_printk("not ethernet packet"); return 1; } // Skip ethhdr for next hdr. offset += sizeof(struct ethhdr); } else { __builtin_memset(ethh, 0, sizeof(struct ethhdr)); ethh->h_proto = skb->protocol; } *ihl = 0; *l4proto = 0; __builtin_memset(iph, 0, sizeof(struct iphdr)); __builtin_memset(ipv6h, 0, sizeof(struct ipv6hdr)); __builtin_memset(icmp6h, 0, sizeof(struct icmp6hdr)); __builtin_memset(tcph, 0, sizeof(struct tcphdr)); __builtin_memset(udph, 0, sizeof(struct udphdr)); // bpf_printk("parse_transport: h_proto: %u ? %u %u", ethh->h_proto, // bpf_htons(ETH_P_IP), // bpf_htons(ETH_P_IPV6)); if (ethh->h_proto == bpf_htons(ETH_P_IP)) { ret = bpf_skb_load_bytes(skb, offset, iph, sizeof(struct iphdr)); if (ret) return -EFAULT; // Skip ipv4hdr and options for next hdr. offset += iph->ihl * 4; // We only process TCP and UDP traffic. *l4proto = iph->protocol; switch (iph->protocol) { case IPPROTO_TCP: { ret = bpf_skb_load_bytes(skb, offset, tcph, sizeof(struct tcphdr)); if (ret) { // Not a complete tcphdr. return -EFAULT; } } break; case IPPROTO_UDP: { ret = bpf_skb_load_bytes(skb, offset, udph, sizeof(struct udphdr)); if (ret) { // Not a complete udphdr. return -EFAULT; } } break; default: return 1; } *ihl = iph->ihl; return 0; } else if (ethh->h_proto == bpf_htons(ETH_P_IPV6)) { ret = bpf_skb_load_bytes(skb, offset, ipv6h, sizeof(struct ipv6hdr)); if (ret) { bpf_printk("not a valid IPv6 packet"); return -EFAULT; } offset += sizeof(struct ipv6hdr); return handle_ipv6_extensions(skb, offset, ipv6h->nexthdr, icmp6h, tcph, udph, ihl, l4proto); } else { /// EXPECTED: Maybe ICMP, MPLS, etc. 
// bpf_printk("IP but not supported packet: protocol is %u", // iph->protocol); // bpf_printk("unknown link proto: %u", bpf_ntohl(skb->protocol)); return 1; } } struct route_params { __u32 flag[8]; const void *l4hdr; const __be32 *saddr; const __be32 *daddr; __be32 mac[4]; }; struct route_ctx { const struct route_params *params; __u16 h_dport; __u16 h_sport; __s64 result; // high -> low: sign(1b) unused(23b) mark(32b) outbound(8b) struct lpm_key lpm_key_saddr, lpm_key_daddr, lpm_key_mac; volatile __u8 isdns_must_goodsubrule_badrule; }; static int route_loop_cb(__u32 index, void *data) { #define _l4proto_type ctx->params->flag[0] #define _ipversion_type ctx->params->flag[1] #define _pname (&ctx->params->flag[2]) #define _is_wan ctx->params->flag[2] #define _dscp ctx->params->flag[6] struct route_ctx *ctx = data; struct match_set *match_set; struct lpm_key *lpm_key; struct map_lpm_type *lpm; // Rule is like: domain(suffix:baidu.com, suffix:google.com) && port(443) -> // proxy Subrule is like: domain(suffix:baidu.com, suffix:google.com) Match // set is like: suffix:baidu.com struct domain_routing *domain_routing; if (unlikely(index / 32 >= MAX_MATCH_SET_LEN / 32)) { ctx->result = -EFAULT; return 1; } __u32 k = index; // Clone to pass code checker. match_set = bpf_map_lookup_elem(&routing_map, &k); if (unlikely(!match_set)) { ctx->result = -EFAULT; return 1; } if (ctx->isdns_must_goodsubrule_badrule & 0b11) { #ifdef __DEBUG_ROUTING bpf_printk("key(match_set->type): %llu", match_set->type); bpf_printk("Skip to judge. bad_rule: %d, good_subrule: %d", ctx->isdns_must_goodsubrule_badrule & 0b10, ctx->isdns_must_goodsubrule_badrule & 0b1); #endif goto before_next_loop; } switch (match_set->type) { case MatchType_Mac: lpm_key = &ctx->lpm_key_mac; goto lookup_lpm; case MatchType_IpSet: lpm_key = &ctx->lpm_key_daddr; goto lookup_lpm; case MatchType_SourceIpSet: lpm_key = &ctx->lpm_key_saddr; lookup_lpm: #ifdef __DEBUG_ROUTING bpf_printk( "CHECK: lpm_key_map, match_set->type: %u, not: %d, outbound: %u", match_set->type, match_set->not, match_set->outbound); bpf_printk("\tip: %pI6", lpm_key->data); #endif lpm = bpf_map_lookup_elem(&lpm_array_map, &match_set->index); if (unlikely(!lpm)) { ctx->result = -EFAULT; return 1; } if (bpf_map_lookup_elem(lpm, lpm_key)) { // match_set hits. 
ctx->isdns_must_goodsubrule_badrule |= 0b10; } break; case MatchType_Port: #ifdef __DEBUG_ROUTING bpf_printk( "CHECK: h_port_map, match_set->type: %u, not: %d, outbound: %u", match_set->type, match_set->not, match_set->outbound); bpf_printk("\tport: %u, range: [%u, %u]", ctx->h_dport, match_set->port_range.port_start, match_set->port_range.port_end); #endif if (match_set->port_range.port_start <= ctx->h_dport && ctx->h_dport <= match_set->port_range.port_end) { ctx->isdns_must_goodsubrule_badrule |= 0b10; } break; case MatchType_SourcePort: #ifdef __DEBUG_ROUTING bpf_printk( "CHECK: h_port_map, match_set->type: %u, not: %d, outbound: %u", match_set->type, match_set->not, match_set->outbound); bpf_printk("\tport: %u, range: [%u, %u]", ctx->h_sport, match_set->port_range.port_start, match_set->port_range.port_end); #endif if (match_set->port_range.port_start <= ctx->h_sport && ctx->h_sport <= match_set->port_range.port_end) { ctx->isdns_must_goodsubrule_badrule |= 0b10; } break; case MatchType_L4Proto: #ifdef __DEBUG_ROUTING bpf_printk( "CHECK: l4proto, match_set->type: %u, not: %d, outbound: %u", match_set->type, match_set->not, match_set->outbound); #endif if (_l4proto_type & match_set->l4proto_type) ctx->isdns_must_goodsubrule_badrule |= 0b10; break; case MatchType_IpVersion: #ifdef __DEBUG_ROUTING bpf_printk( "CHECK: ipversion, match_set->type: %u, not: %d, outbound: %u", match_set->type, match_set->not, match_set->outbound); #endif if (_ipversion_type & match_set->ip_version) ctx->isdns_must_goodsubrule_badrule |= 0b10; break; case MatchType_DomainSet: #ifdef __DEBUG_ROUTING bpf_printk( "CHECK: domain, match_set->type: %u, not: %d, outbound: %u", match_set->type, match_set->not, match_set->outbound); #endif // Get domain routing bitmap. domain_routing = bpf_map_lookup_elem(&domain_routing_map, ctx->params->daddr); // We use key instead of k to pass checker. if (domain_routing && (domain_routing->bitmap[index / 32] >> (index % 32)) & 1) ctx->isdns_must_goodsubrule_badrule |= 0b10; break; case MatchType_ProcessName: if (_is_wan && equal16(match_set->pname, _pname)) ctx->isdns_must_goodsubrule_badrule |= 0b10; break; case MatchType_Dscp: if (_dscp == match_set->dscp) ctx->isdns_must_goodsubrule_badrule |= 0b10; break; case MatchType_Fallback: #ifdef __DEBUG_ROUTING bpf_printk("CHECK: hit fallback"); #endif ctx->isdns_must_goodsubrule_badrule |= 0b10; break; default: #ifdef __DEBUG_ROUTING bpf_printk( "CHECK: , match_set->type: %u, not: %d, outbound: %u", match_set->type, match_set->not, match_set->outbound); #endif ctx->result = -EINVAL; return 1; } before_next_loop: #ifdef __DEBUG_ROUTING bpf_printk("good_subrule: %d, bad_rule: %d", ctx->isdns_must_goodsubrule_badrule & 0b10, ctx->isdns_must_goodsubrule_badrule & 0b1); #endif if (match_set->outbound != OUTBOUND_LOGICAL_OR) { // This match_set reaches the end of subrule. // We are now at end of rule, or next match_set belongs to another // subrule. if ((ctx->isdns_must_goodsubrule_badrule & 0b10) > 0 == match_set->not ) { // This subrule does not hit. ctx->isdns_must_goodsubrule_badrule |= 0b1; } // Reset good_subrule. ctx->isdns_must_goodsubrule_badrule &= ~0b10; } #ifdef __DEBUG_ROUTING bpf_printk("_bad_rule: %d", ctx->isdns_must_goodsubrule_badrule & 0b1); #endif if ((match_set->outbound & OUTBOUND_LOGICAL_MASK) != OUTBOUND_LOGICAL_MASK) { // Tail of a rule (line). // Decide whether to hit. 
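		/* Descriptive note on the encodings used here (derived from
		 * the code in this file):
		 *
		 * ctx->isdns_must_goodsubrule_badrule packs four flags:
		 *   0b0001  bad_rule     - the current rule already failed
		 *   0b0010  good_subrule - the current subrule matched
		 *   0b0100  must         - an OUTBOUND_MUST_RULES line was hit
		 *   0b1000  isdns        - set in route() for UDP with dport 53
		 *
		 * ctx->result packs the routing decision into a non-negative
		 * __s64:
		 *   outbound =  result        & 0xFF;
		 *   mark     = (result >> 8)  & 0xFFFFFFFF;
		 *   must     = (result >> 40) & 1;
		 * Negative values are errors (-errno). DNS rules that are not
		 * "must" are rewritten to OUTBOUND_CONTROL_PLANE_ROUTING
		 * below.
		 */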
if (!(ctx->isdns_must_goodsubrule_badrule & 0b1)) { #ifdef __DEBUG_ROUTING bpf_printk( "MATCHED: match_set->type: %u, match_set->not: %d", match_set->type, match_set->not ); #endif // DNS requests should routed by control plane if outbound is not // must_direct. if (unlikely(match_set->outbound == OUTBOUND_MUST_RULES)) { ctx->isdns_must_goodsubrule_badrule |= 0b100; } else { if (ctx->isdns_must_goodsubrule_badrule & 0b100) match_set->must = true; if (!match_set->must && (ctx->isdns_must_goodsubrule_badrule & 0b1000)) { ctx->result = (__s64)OUTBOUND_CONTROL_PLANE_ROUTING | ((__s64)match_set->mark << 8) | ((__s64)match_set->must << 40); #ifdef __DEBUG_ROUTING bpf_printk( "OUTBOUND_CONTROL_PLANE_ROUTING: %ld", ctx->result); #endif return 1; } ctx->result = (__s64)match_set->outbound | ((__s64)match_set->mark << 8) | ((__s64)match_set->must << 40); #ifdef __DEBUG_ROUTING bpf_printk("outbound %u: %ld", match_set->outbound, ctx->result); #endif return 1; } } ctx->isdns_must_goodsubrule_badrule &= ~0b1; } return 0; #undef _l4proto_type #undef _ipversion_type #undef _pname #undef _is_wan #undef _dscp } static __always_inline __s64 route(const struct route_params *params) { #define _l4proto_type params->flag[0] #define _ipversion_type params->flag[1] #define _pname (¶ms->flag[2]) #define _is_wan params->flag[2] #define _dscp params->flag[6] int ret; struct route_ctx ctx; __builtin_memset(&ctx, 0, sizeof(ctx)); ctx.params = params; ctx.result = -ENOEXEC; // Variables for further use. if (_l4proto_type == L4ProtoType_TCP) { ctx.h_dport = bpf_ntohs(((struct tcphdr *)params->l4hdr)->dest); ctx.h_sport = bpf_ntohs(((struct tcphdr *)params->l4hdr)->source); } else { ctx.h_dport = bpf_ntohs(((struct udphdr *)params->l4hdr)->dest); ctx.h_sport = bpf_ntohs(((struct udphdr *)params->l4hdr)->source); } // Rule is like: domain(suffix:baidu.com, suffix:google.com) && port(443) -> // proxy Subrule is like: domain(suffix:baidu.com, suffix:google.com) Match // set is like: suffix:baidu.com ctx.isdns_must_goodsubrule_badrule = (ctx.h_dport == 53 && _l4proto_type == L4ProtoType_UDP) << 3; struct lpm_key lpm_key_saddr = { .trie_key = { IPV6_BYTE_LENGTH * 8, {} }, }; ctx.lpm_key_saddr = lpm_key_saddr; struct lpm_key lpm_key_daddr = { .trie_key = { IPV6_BYTE_LENGTH * 8, {} }, }; ctx.lpm_key_daddr = lpm_key_daddr; struct lpm_key lpm_key_mac = { .trie_key = { IPV6_BYTE_LENGTH * 8, {} }, }; ctx.lpm_key_mac = lpm_key_mac; __builtin_memcpy(ctx.lpm_key_saddr.data, params->saddr, IPV6_BYTE_LENGTH); __builtin_memcpy(ctx.lpm_key_daddr.data, params->daddr, IPV6_BYTE_LENGTH); __builtin_memcpy(ctx.lpm_key_mac.data, params->mac, IPV6_BYTE_LENGTH); ret = bpf_loop(MAX_MATCH_SET_LEN, route_loop_cb, &ctx, 0); if (unlikely(ret < 0)) return ret; if (ctx.result >= 0) return ctx.result; bpf_printk( "No match_set hits. 
Did coder forget to sync common/consts/ebpf.go with enum MatchType?"); return -EPERM; #undef _l4proto_type #undef _ipversion_type #undef _pname #undef _is_wan #undef _dscp } static __always_inline __u32 get_link_h_len(__u32 ifindex, volatile __u32 *link_h_len) { __u32 *plink_h_len = bpf_map_lookup_elem(&linklen_map, &ifindex); if (!plink_h_len) return -EIO; *link_h_len = *plink_h_len; return 0; } static __always_inline int assign_listener(struct __sk_buff *skb, __u8 l4proto) { struct bpf_sock *sk; if (l4proto == IPPROTO_TCP) sk = bpf_map_lookup_elem(&listen_socket_map, &zero_key); else sk = bpf_map_lookup_elem(&listen_socket_map, &one_key); if (!sk) return -1; int ret = bpf_sk_assign(skb, sk, 0); bpf_sk_release(sk); return ret; } static __always_inline void prep_redirect_to_control_plane( struct __sk_buff *skb, __u32 link_h_len, struct tuples *tuples, __u8 l4proto, struct ethhdr *ethh, __u8 from_wan, struct tcphdr *tcph) { /* Redirect from L3 dev to L2 dev, e.g. wg0 -> veth */ if (!link_h_len) { __u16 l3proto = skb->protocol; bpf_skb_change_head(skb, sizeof(struct ethhdr), 0); bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto), &l3proto, sizeof(l3proto), 0); } bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), (void *)&PARAM.dae0peer_mac, sizeof(ethh->h_dest), 0); struct redirect_tuple redirect_tuple = {}; if (skb->protocol == bpf_htons(ETH_P_IP)) { redirect_tuple.sip.u6_addr32[3] = tuples->five.sip.u6_addr32[3]; redirect_tuple.dip.u6_addr32[3] = tuples->five.dip.u6_addr32[3]; } else { __builtin_memcpy(&redirect_tuple.sip, &tuples->five.sip, IPV6_BYTE_LENGTH); __builtin_memcpy(&redirect_tuple.dip, &tuples->five.dip, IPV6_BYTE_LENGTH); } struct redirect_entry redirect_entry = {}; redirect_entry.ifindex = skb->ifindex; redirect_entry.from_wan = from_wan; __builtin_memcpy(redirect_entry.smac, ethh->h_source, sizeof(ethh->h_source)); __builtin_memcpy(redirect_entry.dmac, ethh->h_dest, sizeof(ethh->h_dest)); bpf_map_update_elem(&redirect_track, &redirect_tuple, &redirect_entry, BPF_ANY); skb->cb[0] = TPROXY_MARK; skb->cb[1] = 0; if ((l4proto == IPPROTO_TCP && tcph->syn) || l4proto == IPPROTO_UDP) skb->cb[1] = l4proto; } SEC("tc/egress") int tproxy_lan_egress(struct __sk_buff *skb) { if (skb->ingress_ifindex != NOWHERE_IFINDEX) return TC_ACT_PIPE; struct ethhdr ethh; struct iphdr iph; struct ipv6hdr ipv6h; struct icmp6hdr icmp6h; struct tcphdr tcph; struct udphdr udph; __u8 ihl; __u8 l4proto; __u32 link_h_len; if (get_link_h_len(skb->ifindex, &link_h_len)) return TC_ACT_OK; int ret = parse_transport(skb, link_h_len, ðh, &iph, &ipv6h, &icmp6h, &tcph, &udph, &ihl, &l4proto); if (ret) { bpf_printk("parse_transport: %d", ret); return TC_ACT_OK; } if (l4proto == IPPROTO_ICMPV6 && icmp6h.icmp6_type == NDP_REDIRECT) { // REDIRECT (NDP) return TC_ACT_SHOT; } return TC_ACT_PIPE; } SEC("tc/ingress") int tproxy_lan_ingress(struct __sk_buff *skb) { struct ethhdr ethh; struct iphdr iph; struct ipv6hdr ipv6h; struct icmp6hdr icmp6h; struct tcphdr tcph; struct udphdr udph; __u8 ihl; __u8 l4proto; __u32 link_h_len; if (get_link_h_len(skb->ifindex, &link_h_len)) return TC_ACT_OK; int ret = parse_transport(skb, link_h_len, ðh, &iph, &ipv6h, &icmp6h, &tcph, &udph, &ihl, &l4proto); if (ret) { bpf_printk("parse_transport: %d", ret); return TC_ACT_OK; } if (l4proto == IPPROTO_ICMPV6) return TC_ACT_OK; // Prepare five tuples. 
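	/* Remaining LAN ingress flow (descriptive summary of the code below):
	 * build the 5-tuple, try to match an already-established TCP socket
	 * in dae's network namespace (established flows go straight to the
	 * control plane), route new connections, persist the result in
	 * routing_tuples_map, check outbound connectivity, and finally either
	 * pass the packet (direct), drop it (block), or redirect it to the
	 * dae0 interface for the control plane.
	 */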
	struct tuples tuples;

	get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto);

	/*
	 * ip rule add fwmark 0x8000000/0x8000000 table 2023
	 * ip route add local default dev lo table 2023
	 * ip -6 rule add fwmark 0x8000000/0x8000000 table 2023
	 * ip -6 route add local default dev lo table 2023
	 * ip rule del fwmark 0x8000000/0x8000000 table 2023
	 * ip route del local default dev lo table 2023
	 * ip -6 rule del fwmark 0x8000000/0x8000000 table 2023
	 * ip -6 route del local default dev lo table 2023
	 */
	// Socket lookup and assign skb to existing socket connection.
	struct bpf_sock_tuple tuple = { 0 };
	__u32 tuple_size;
	struct bpf_sock *sk;

	if (skb->protocol == bpf_htons(ETH_P_IP)) {
		tuple.ipv4.daddr = tuples.five.dip.u6_addr32[3];
		tuple.ipv4.saddr = tuples.five.sip.u6_addr32[3];
		tuple.ipv4.dport = tuples.five.dport;
		tuple.ipv4.sport = tuples.five.sport;
		tuple_size = sizeof(tuple.ipv4);
	} else {
		__builtin_memcpy(tuple.ipv6.daddr, &tuples.five.dip,
				 IPV6_BYTE_LENGTH);
		__builtin_memcpy(tuple.ipv6.saddr, &tuples.five.sip,
				 IPV6_BYTE_LENGTH);
		tuple.ipv6.dport = tuples.five.dport;
		tuple.ipv6.sport = tuples.five.sport;
		tuple_size = sizeof(tuple.ipv6);
	}

	if (l4proto == IPPROTO_TCP) {
		// TCP.
		if (tcph.syn && !tcph.ack)
			goto new_connection;

		sk = bpf_skc_lookup_tcp(skb, &tuple, tuple_size,
					PARAM.dae_netns_id, 0);
		if (sk) {
			if (sk->state != BPF_TCP_LISTEN) {
				bpf_sk_release(sk);
				goto control_plane;
			}
			bpf_sk_release(sk);
		}
	}

// Routing for new connection.
new_connection:;
	struct route_params params;

	__builtin_memset(&params, 0, sizeof(params));
	if (l4proto == IPPROTO_TCP) {
		if (!(tcph.syn && !tcph.ack)) {
			// Not a new TCP connection.
			// Perhaps single-arm.
			return TC_ACT_OK;
		}
		params.l4hdr = &tcph;
		params.flag[0] = L4ProtoType_TCP;
	} else {
		params.l4hdr = &udph;
		params.flag[0] = L4ProtoType_UDP;
	}
	if (skb->protocol == bpf_htons(ETH_P_IP))
		params.flag[1] = IpVersionType_4;
	else
		params.flag[1] = IpVersionType_6;
	params.flag[6] = tuples.dscp;
	params.mac[2] = bpf_htonl((ethh.h_source[0] << 8) | (ethh.h_source[1]));
	params.mac[3] = bpf_htonl((ethh.h_source[2] << 24) |
				  (ethh.h_source[3] << 16) |
				  (ethh.h_source[4] << 8) |
				  (ethh.h_source[5]));
	params.saddr = tuples.five.sip.u6_addr32;
	params.daddr = tuples.five.dip.u6_addr32;
	__s64 s64_ret;

	s64_ret = route(&params);
	if (s64_ret < 0) {
		bpf_printk("shot routing: %d", s64_ret);
		return TC_ACT_SHOT;
	}
	struct routing_result routing_result = { 0 };

	routing_result.outbound = s64_ret;
	routing_result.mark = s64_ret >> 8;
	routing_result.must = (s64_ret >> 40) & 1;
	routing_result.dscp = tuples.dscp;
	__builtin_memcpy(routing_result.mac, ethh.h_source,
			 sizeof(routing_result.mac));
	/// NOTICE: No pid pname info for LAN packet.
	// // Maybe this packet is also in the host (such as docker) ?
	// // I tried and it is false.
	//__u64 cookie = bpf_get_socket_cookie(skb);
	//struct pid_pname *pid_pname =
	//	bpf_map_lookup_elem(&cookie_pid_map, &cookie);
	//if (pid_pname) {
	//	__builtin_memcpy(routing_result.pname, pid_pname->pname,
	//			 TASK_COMM_LEN);
	//	routing_result.pid = pid_pname->pid;
	//}

	// Save routing result.
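	/* Note: this entry is what later stages rely on: tproxy_wan_egress()
	 * reuses it for established TCP connections instead of re-routing,
	 * and local_tcp_sockops() only adds sockets to fast_sock when a
	 * matching entry (with a pid) exists. The map is pinned, so entries
	 * survive program reloads.
	 */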
	ret = bpf_map_update_elem(&routing_tuples_map, &tuples.five,
				  &routing_result, BPF_ANY);
	if (ret) {
		bpf_printk("shot save routing result: %d", ret);
		return TC_ACT_SHOT;
	}
#if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT)
	if (l4proto == IPPROTO_TCP) {
		bpf_printk("tcp(lan): outbound: %u, target: %pI6:%u",
			   routing_result.outbound, tuples.five.dip.u6_addr32,
			   bpf_ntohs(tuples.five.dport));
	} else {
		bpf_printk("udp(lan): outbound: %u, target: %pI6:%u",
			   routing_result.outbound, tuples.five.dip.u6_addr32,
			   bpf_ntohs(tuples.five.dport));
	}
#endif
	if (routing_result.outbound == OUTBOUND_DIRECT) {
		skb->mark = routing_result.mark;
#if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT)
		bpf_printk("GO OUTBOUND_DIRECT");
#endif
		goto direct;
	} else if (unlikely(routing_result.outbound == OUTBOUND_BLOCK)) {
#if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT)
		bpf_printk("SHOT OUTBOUND_BLOCK");
#endif
		goto block;
	}

	// Check outbound connectivity in specific ipversion and l4proto.
	struct outbound_connectivity_query q = { 0 };

	q.outbound = routing_result.outbound;
	q.ipversion = skb->protocol == bpf_htons(ETH_P_IP) ? 4 : 6;
	q.l4proto = l4proto;
	__u32 *alive;

	alive = bpf_map_lookup_elem(&outbound_connectivity_map, &q);
	if (alive && *alive == 0 &&
	    !(l4proto == IPPROTO_UDP && tuples.five.dport == bpf_htons(53))) {
		// Outbound is not alive. DNS is an exception.
		goto block;
	}

	// Assign to control plane.
control_plane:
	prep_redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, &ethh,
				       0, &tcph);
	return bpf_redirect(PARAM.dae0_ifindex, 0);

direct:
	return TC_ACT_OK;

block:
	return TC_ACT_SHOT;
}

// Cookie will change after the first packet, so we just use it for the
// handshake.
static __always_inline bool pid_is_control_plane(struct __sk_buff *skb,
						 struct pid_pname **p)
{
	struct pid_pname *pid_pname;
	__u64 cookie = bpf_get_socket_cookie(skb);

	pid_pname = bpf_map_lookup_elem(&cookie_pid_map, &cookie);
	if (pid_pname) {
		if (p) {
			// Assign.
			*p = pid_pname;
		}
		// Compare against the control plane (tproxy) pid.
		__u32 pid_tproxy;

		pid_tproxy = PARAM.control_plane_pid;
		if (!pid_tproxy) {
			bpf_printk("control_plane_pid is not set.");
			return false;
		}
		return pid_pname->pid == pid_tproxy;
	}
	if (p)
		*p = NULL;
	if ((skb->mark & 0x100) == 0x100) {
		bpf_printk("No pid_pname found. But it should not happen");
		/*
		 * if (l4proto == IPPROTO_TCP) {
		 *	if (tcph.syn && !tcph.ack) {
		 *		bpf_printk("No pid_pname found. But it should not happen: local:%u "
		 *			   "(%u)[%llu]",
		 *			   bpf_ntohs(sport), l4proto, cookie);
		 *	} else {
		 *		bpf_printk("No pid_pname found. But it should not happen: (Old "
		 *			   "Connection): local:%u "
		 *			   "(%u)[%llu]",
		 *			   bpf_ntohs(sport), l4proto, cookie);
		 *	}
		 * } else {
		 *	bpf_printk("No pid_pname found.
But it should not happen: local:%u " * "(%u)[%llu]", * bpf_ntohs(sport), l4proto, cookie); * } */ return true; } return false; } static int refresh_udp_conn_state_timer_cb(void *_udp_conn_state_map, struct tuples_key *key, struct udp_conn_state *val) { bpf_map_delete_elem(&udp_conn_state_map, key); return 0; } static __always_inline void copy_reversed_tuples(struct tuples_key *key, struct tuples_key *dst) { __builtin_memset(dst, 0, sizeof(*dst)); dst->dip = key->sip; dst->sip = key->dip; dst->sport = key->dport; dst->dport = key->sport; dst->l4proto = key->l4proto; } static __always_inline struct udp_conn_state * refresh_udp_conn_state_timer(struct tuples_key *key, bool is_egress) { struct udp_conn_state *old_conn_state = bpf_map_lookup_elem(&udp_conn_state_map, key); struct udp_conn_state new_conn_state = { 0 }; if (old_conn_state) new_conn_state.is_egress = old_conn_state->is_egress; // Keep the value. else new_conn_state.is_egress = is_egress; long ret = bpf_map_update_elem(&udp_conn_state_map, key, &new_conn_state, BPF_ANY); if (unlikely(ret)) return NULL; struct udp_conn_state *value = bpf_map_lookup_elem(&udp_conn_state_map, key); if (unlikely(!value)) return NULL; if ((bpf_timer_init(&value->timer, &udp_conn_state_map, CLOCK_MONOTONIC))) goto retn; if ((bpf_timer_set_callback(&value->timer, refresh_udp_conn_state_timer_cb))) goto retn; if ((bpf_timer_start(&value->timer, TIMEOUT_UDP_CONN_STATE, 0))) goto retn; retn: return value; } SEC("tc/wan_ingress") int tproxy_wan_ingress(struct __sk_buff *skb) { struct ethhdr ethh; struct iphdr iph; struct ipv6hdr ipv6h; struct icmp6hdr icmp6h; struct tcphdr tcph; struct udphdr udph; __u8 ihl; __u8 l4proto; __u32 link_h_len; if (get_link_h_len(skb->ifindex, &link_h_len)) return TC_ACT_OK; int ret = parse_transport(skb, link_h_len, ðh, &iph, &ipv6h, &icmp6h, &tcph, &udph, &ihl, &l4proto); if (ret) return TC_ACT_OK; if (l4proto != IPPROTO_UDP) return TC_ACT_PIPE; struct tuples tuples; struct tuples_key reversed_tuples_key; get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto); copy_reversed_tuples(&tuples.five, &reversed_tuples_key); if (!refresh_udp_conn_state_timer(&reversed_tuples_key, false)) return TC_ACT_SHOT; return TC_ACT_PIPE; } // Routing and redirect the packet back. // We cannot modify the dest address here. So we cooperate with wan_ingress. SEC("tc/wan_egress") int tproxy_wan_egress(struct __sk_buff *skb) { // Skip packets not from localhost. if (skb->ingress_ifindex != NOWHERE_IFINDEX) return TC_ACT_OK; // if ((skb->mark & 0x80) == 0x80) { // return TC_ACT_OK; // } struct ethhdr ethh; struct iphdr iph; struct ipv6hdr ipv6h; struct icmp6hdr icmp6h; struct tcphdr tcph; struct udphdr udph; __u8 ihl; __u8 l4proto; __u32 link_h_len; if (get_link_h_len(skb->ifindex, &link_h_len)) return TC_ACT_OK; bool tcp_state_syn; int ret = parse_transport(skb, link_h_len, ðh, &iph, &ipv6h, &icmp6h, &tcph, &udph, &ihl, &l4proto); if (ret) return TC_ACT_OK; if (l4proto == IPPROTO_ICMPV6) return TC_ACT_OK; // Backup for further use. struct tuples tuples; get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto); // Normal packets. if (l4proto == IPPROTO_TCP) { // Backup for further use. tcp_state_syn = tcph.syn && !tcph.ack; __u8 outbound; bool must; __u32 mark; struct pid_pname *pid_pname = NULL; if (unlikely(tcp_state_syn)) { // New TCP connection. 
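			/* For a brand-new connection we do a full routing
			 * pass. The process name comes from cookie_pid_map,
			 * which the cgroup/sock_create, connect and sendmsg
			 * hooks below fill in keyed by socket cookie; packets
			 * from the control plane itself are let through
			 * directly to avoid a redirect loop.
			 */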
// bpf_printk("[%X]New Connection", bpf_ntohl(tcph.seq)); struct route_params params; __builtin_memset(¶ms, 0, sizeof(params)); params.l4hdr = &tcph; params.flag[0] = L4ProtoType_TCP; if (skb->protocol == bpf_htons(ETH_P_IP)) params.flag[1] = IpVersionType_4; else params.flag[1] = IpVersionType_6; params.flag[6] = tuples.dscp; if (pid_is_control_plane(skb, &pid_pname)) { // From control plane. Direct. return TC_ACT_OK; } if (pid_pname) { // 2, 3, 4, 5 __builtin_memcpy(¶ms.flag[2], pid_pname->pname, TASK_COMM_LEN); } params.mac[2] = bpf_htonl((ethh.h_source[0] << 8) | (ethh.h_source[1])); params.mac[3] = bpf_htonl((ethh.h_source[2] << 24) | (ethh.h_source[3] << 16) | (ethh.h_source[4] << 8) | (ethh.h_source[5])); params.saddr = tuples.five.sip.u6_addr32; params.daddr = tuples.five.dip.u6_addr32; __s64 s64_ret; s64_ret = route(¶ms); if (s64_ret < 0) { bpf_printk("shot routing: %d", s64_ret); return TC_ACT_SHOT; } outbound = s64_ret & 0xff; mark = s64_ret >> 8; must = (s64_ret >> 40) & 1; #if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT) // Print only new connection. __u32 pid = pid_pname ? pid_pname->pid : 0; bpf_printk("tcp(wan): from %pI6:%u [PID %u]", tuples.five.sip.u6_addr32, bpf_ntohs(tuples.five.sport), pid); bpf_printk("tcp(wan): outbound: %u, %pI6:%u", outbound, tuples.five.dip.u6_addr32, bpf_ntohs(tuples.five.dport)); #endif } else { // bpf_printk("[%X]Old Connection", bpf_ntohl(tcph.seq)); // The TCP connection exists. struct routing_result *routing_result = bpf_map_lookup_elem(&routing_tuples_map, &tuples.five); if (!routing_result) { // Do not impact previous connections and server connections. return TC_ACT_OK; } outbound = routing_result->outbound; mark = routing_result->mark; must = routing_result->must; } if (outbound == OUTBOUND_DIRECT && mark == 0 // If mark is not zero, we should re-route it, so we send it // to control plane in WAN. ) { #if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT) bpf_printk("GO OUTBOUND_DIRECT"); #endif skb->mark = mark; return TC_ACT_OK; } else if (unlikely(outbound == OUTBOUND_BLOCK)) { #if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT) bpf_printk("SHOT OUTBOUND_BLOCK"); #endif return TC_ACT_SHOT; } // Rewrite to control plane. // Check outbound connectivity in specific ipversion and l4proto. struct outbound_connectivity_query q = { 0 }; q.outbound = outbound; q.ipversion = skb->protocol == bpf_htons(ETH_P_IP) ? 4 : 6; q.l4proto = l4proto; __u32 *alive; alive = bpf_map_lookup_elem(&outbound_connectivity_map, &q); if (alive && *alive == 0 && !(l4proto == IPPROTO_UDP && tuples.five.dport == bpf_htons(53))) { // Outbound is not alive. Dns is an exception. return TC_ACT_SHOT; } if (unlikely(tcp_state_syn)) { struct routing_result routing_result = {}; routing_result.outbound = outbound; routing_result.mark = mark; routing_result.must = must; routing_result.dscp = tuples.dscp; __builtin_memcpy(routing_result.mac, ethh.h_source, sizeof(ethh.h_source)); if (pid_pname) { __builtin_memcpy(routing_result.pname, pid_pname->pname, TASK_COMM_LEN); routing_result.pid = pid_pname->pid; } bpf_map_update_elem(&routing_tuples_map, &tuples.five, &routing_result, BPF_ANY); } } else if (l4proto == IPPROTO_UDP) { // Routing. It decides if we redirect traffic to control plane. 
struct route_params params; __builtin_memset(¶ms, 0, sizeof(params)); params.l4hdr = &udph; params.flag[0] = L4ProtoType_UDP; if (skb->protocol == bpf_htons(ETH_P_IP)) params.flag[1] = IpVersionType_4; else params.flag[1] = IpVersionType_6; params.flag[6] = tuples.dscp; struct pid_pname *pid_pname; if (pid_is_control_plane(skb, &pid_pname)) { // from control plane // => direct. return TC_ACT_OK; } struct udp_conn_state *conn_state = refresh_udp_conn_state_timer(&tuples.five, true); if (!conn_state) return TC_ACT_SHOT; if (!conn_state->is_egress) { // Input udp connection // => direct. return TC_ACT_OK; } if (pid_pname) { // 2, 3, 4, 5 __builtin_memcpy(¶ms.flag[2], pid_pname->pname, TASK_COMM_LEN); } params.mac[2] = bpf_htonl((ethh.h_source[0] << 8) | (ethh.h_source[1])); params.mac[3] = bpf_htonl( (ethh.h_source[2] << 24) | (ethh.h_source[3] << 16) | (ethh.h_source[4] << 8) | (ethh.h_source[5])); params.saddr = tuples.five.sip.u6_addr32; params.daddr = tuples.five.dip.u6_addr32; __s64 s64_ret; s64_ret = route(¶ms); if (s64_ret < 0) { bpf_printk("shot routing: %d", s64_ret); return TC_ACT_SHOT; } // Construct new hdr to encap. struct routing_result routing_result = {}; routing_result.outbound = s64_ret; routing_result.mark = s64_ret >> 8; routing_result.must = (s64_ret >> 40) & 1; routing_result.dscp = tuples.dscp; __builtin_memcpy(routing_result.mac, ethh.h_source, sizeof(ethh.h_source)); if (pid_pname) { __builtin_memcpy(routing_result.pname, pid_pname->pname, TASK_COMM_LEN); routing_result.pid = pid_pname->pid; } bpf_map_update_elem(&routing_tuples_map, &tuples.five, &routing_result, BPF_ANY); #if defined(__DEBUG_ROUTING) || defined(__PRINT_ROUTING_RESULT) __u32 pid = pid_pname ? pid_pname->pid : 0; bpf_printk("udp(wan): from %pI6:%u [PID %u]", tuples.five.sip.u6_addr32, bpf_ntohs(tuples.five.sport), pid); bpf_printk("udp(wan): outbound: %u, %pI6:%u", routing_result.outbound, tuples.five.dip.u6_addr32, bpf_ntohs(tuples.five.dport)); #endif if (routing_result.outbound == OUTBOUND_DIRECT && routing_result.mark == 0 // If mark is not zero, we should re-route it, so we send it to control // plane in WAN. ) { return TC_ACT_OK; } else if (unlikely(routing_result.outbound == OUTBOUND_BLOCK)) { return TC_ACT_SHOT; } // Rewrite to control plane. // Check outbound connectivity in specific ipversion and l4proto. struct outbound_connectivity_query q = { 0 }; q.outbound = routing_result.outbound; q.ipversion = skb->protocol == bpf_htons(ETH_P_IP) ? 4 : 6; q.l4proto = l4proto; __u32 *alive; alive = bpf_map_lookup_elem(&outbound_connectivity_map, &q); if (alive && *alive == 0 && !(l4proto == IPPROTO_UDP && tuples.five.dport == bpf_htons(53))) { // Outbound is not alive. Dns is an exception. return TC_ACT_SHOT; } } prep_redirect_to_control_plane(skb, link_h_len, &tuples, l4proto, ðh, 1, &tcph); return bpf_redirect(PARAM.dae0_ifindex, 0); } SEC("tc/dae0peer_ingress") int tproxy_dae0peer_ingress(struct __sk_buff *skb) { /* Only packets redirected from wan_egress or lan_ingress have this cb mark. */ if (skb->cb[0] != TPROXY_MARK) return TC_ACT_SHOT; /* ip rule add fwmark 0x8000000/0x8000000 table 2023 * ip route add local default dev lo table 2023 */ skb->mark = TPROXY_MARK; bpf_skb_change_type(skb, PACKET_HOST); /* l4proto is stored in skb->cb[1] only for UDP and new TCP. As for * established TCP, kernel can take care of socket lookup, so just * return them to stack without calling bpf_sk_assign. 
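	 *
	 * assign_listener() below picks dae's TCP or UDP listener out of
	 * listen_socket_map and bpf_sk_assign()s the skb to it; together with
	 * the TPROXY_MARK fwmark and the "ip rule/route ... table 2023" setup
	 * noted above, this delivers the packet locally to the tproxy
	 * listener.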
*/ __u8 l4proto = skb->cb[1]; if (l4proto != 0) assign_listener(skb, l4proto); return TC_ACT_OK; } SEC("tc/dae0_ingress") int tproxy_dae0_ingress(struct __sk_buff *skb) { // reverse the tuple! struct redirect_tuple redirect_tuple = {}; if (skb->protocol == bpf_htons(ETH_P_IP)) { bpf_skb_load_bytes(skb, ETH_HLEN + offsetof(struct iphdr, daddr), &redirect_tuple.sip.u6_addr32[3], sizeof(redirect_tuple.sip.u6_addr32[3])); bpf_skb_load_bytes(skb, ETH_HLEN + offsetof(struct iphdr, saddr), &redirect_tuple.dip.u6_addr32[3], sizeof(redirect_tuple.dip.u6_addr32[3])); } else { bpf_skb_load_bytes(skb, ETH_HLEN + offsetof(struct ipv6hdr, daddr), &redirect_tuple.sip, sizeof(redirect_tuple.sip)); bpf_skb_load_bytes(skb, ETH_HLEN + offsetof(struct ipv6hdr, saddr), &redirect_tuple.dip, sizeof(redirect_tuple.dip)); } struct redirect_entry *redirect_entry = bpf_map_lookup_elem(&redirect_track, &redirect_tuple); if (!redirect_entry) return TC_ACT_OK; bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_source), redirect_entry->dmac, sizeof(redirect_entry->dmac), 0); bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest), redirect_entry->smac, sizeof(redirect_entry->smac), 0); __u32 type = redirect_entry->from_wan ? PACKET_HOST : PACKET_OTHERHOST; bpf_skb_change_type(skb, type); __u64 flags = redirect_entry->from_wan ? BPF_F_INGRESS : 0; return bpf_redirect(redirect_entry->ifindex, flags); } struct get_real_comm_ctx { char *arg_buf; unsigned int l; }; static int __noinline get_real_comm_loop_cb(__u32 index, void *data) { /* * For string like: /usr/lib/sddm/sddm-helper --socket /tmp/sddm-auth1 * We extract "sddm-helper" from it. */ struct get_real_comm_ctx *ctx = (struct get_real_comm_ctx *)data; if (index >= MAX_ARG_LEN) // always false, just to make verifier happy return 1; if (unlikely(ctx->arg_buf[index] == '/')) ctx->l = index + 1; if (unlikely(ctx->arg_buf[index] == ' ' || ctx->arg_buf[index] == '\0')) { // Write to dst. ctx->arg_buf[index] = '\0'; return 1; } return 0; } /// Parse command line arguments to get the real command name and tgid. static __always_inline int get_pid_pname(struct pid_pname *pid_pname) { int ret; // Get pointer to args string. struct task_struct *task = (void *)bpf_get_current_task(); char *args = (void *)BPF_CORE_READ(task, mm, arg_start); // Read args to buffer. char arg_buf[MAX_ARG_LEN]; // Allocate it out of ctx to pass CO-RE struct get_real_comm_ctx ctx = { 0 }; ctx.arg_buf = arg_buf; ret = bpf_core_read_user_str(arg_buf, MAX_ARG_LEN, args); if (unlikely(ret < 0)) { bpf_printk( "failed to read process name: bpf_core_read_user_str: %d", ret); return ret; } // Find range of command name. ret = bpf_loop(MAX_ARG_LEN, get_real_comm_loop_cb, &ctx, 0); if (unlikely(ret < 0)) return ret; unsigned int offset = ctx.l; // Copy it to pass CO-RE ret = bpf_core_read_str(pid_pname->pname, sizeof(pid_pname->pname), arg_buf + offset); if (unlikely(ret < 0)) { bpf_printk("failed to read process name: bpf_core_read_str: %d", ret); return ret; } // Pupulate tgid ret = bpf_core_read(&pid_pname->pid, sizeof(pid_pname->pid), &task->tgid); if (unlikely(ret < 0)) { bpf_printk("failed to read pid: %d", ret); return ret; } return 0; } static __always_inline int _update_map_elem_by_cookie(const __u64 cookie) { if (unlikely(!cookie)) { bpf_printk("zero cookie"); return -EINVAL; } if (bpf_map_lookup_elem(&cookie_pid_map, &cookie)) { // Cookie to pid mapping already exists. return 0; } int ret; // Build value. struct pid_pname val = { 0 }; ret = get_pid_pname(&val); if (ret) return ret; // Update map. 
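	// This cookie -> (pid, pname) entry is what the egress path consumes:
	// pid_is_control_plane() uses it to let dae's own traffic bypass the
	// proxy, and tproxy_wan_egress() copies the pname into the routing
	// parameters for MatchType_ProcessName rules. tgid_pname_map (updated
	// just below) serves as a fallback source in
	// update_map_elem_by_cookie() when reading the args fails.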
ret = bpf_map_update_elem(&cookie_pid_map, &cookie, &val, BPF_ANY); if (unlikely(ret)) { // bpf_printk("setup_mapping_from_sk: failed update map: %d", ret); return ret; } bpf_map_update_elem(&tgid_pname_map, &val.pid, &val.pname, BPF_ANY); #ifdef __PRINT_SETUP_PROCESS_CONNNECTION bpf_printk("setup_mapping: %llu -> %s (%d)", cookie, val.pname, val.pid); #endif return 0; } static __always_inline int update_map_elem_by_cookie(const __u64 cookie) { int ret; ret = _update_map_elem_by_cookie(cookie); if (ret) { // Fallback to only write pid to avoid loop due to packets sent by dae. struct pid_pname val = { 0 }; val.pid = bpf_get_current_pid_tgid() >> 32; __u32(*pname)[TASK_COMM_LEN] = bpf_map_lookup_elem(&tgid_pname_map, &val.pid); if (pname) { __builtin_memcpy(val.pname, *pname, TASK_COMM_LEN); ret = 0; bpf_printk("fallback [retrieve pname]: %u", val.pid); } else { bpf_printk("failed [retrieve pname]: %u", val.pid); } bpf_map_update_elem(&cookie_pid_map, &cookie, &val, BPF_ANY); return ret; } return 0; } // Create cookie to pid, pname mapping. SEC("cgroup/sock_create") int tproxy_wan_cg_sock_create(struct bpf_sock *sk) { update_map_elem_by_cookie(bpf_get_socket_cookie(sk)); return 1; } // Remove cookie to pid, pname mapping. SEC("cgroup/sock_release") int tproxy_wan_cg_sock_release(struct bpf_sock *sk) { __u64 cookie = bpf_get_socket_cookie(sk); if (unlikely(!cookie)) { bpf_printk("zero cookie"); return 1; } bpf_map_delete_elem(&cookie_pid_map, &cookie); return 1; } SEC("cgroup/connect4") int tproxy_wan_cg_connect4(struct bpf_sock_addr *ctx) { update_map_elem_by_cookie(bpf_get_socket_cookie(ctx)); return 1; } SEC("cgroup/connect6") int tproxy_wan_cg_connect6(struct bpf_sock_addr *ctx) { update_map_elem_by_cookie(bpf_get_socket_cookie(ctx)); return 1; } SEC("cgroup/sendmsg4") int tproxy_wan_cg_sendmsg4(struct bpf_sock_addr *ctx) { update_map_elem_by_cookie(bpf_get_socket_cookie(ctx)); return 1; } SEC("cgroup/sendmsg6") int tproxy_wan_cg_sendmsg6(struct bpf_sock_addr *ctx) { update_map_elem_by_cookie(bpf_get_socket_cookie(ctx)); return 1; } SEC("sockops") int local_tcp_sockops(struct bpf_sock_ops *skops) { struct task_struct *task = (struct task_struct *)bpf_get_current_task(); __u32 pid = BPF_CORE_READ(task, pid); /* Only local TCP connection has non-zero pids. 
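	 *
	 * Roughly: when a proxied local TCP connection reaches ESTABLISHED,
	 * both ends are added to fast_sock keyed by their own 4-tuple - the
	 * passive side is dae's socket (found via the reversed tuple in
	 * routing_tuples_map), the active side is the local client. The
	 * sk_msg program below can then splice payload between the two
	 * sockets with bpf_msg_redirect_hash(), bypassing the local stack.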
*/ if (pid == 0) return 0; struct tuples_key tuple = {}; tuple.l4proto = IPPROTO_TCP; tuple.sport = bpf_htonl(skops->local_port) >> 16; tuple.dport = skops->remote_port >> 16; if (skops->family == AF_INET) { tuple.sip.u6_addr32[2] = bpf_htonl(0x0000ffff); tuple.sip.u6_addr32[3] = skops->local_ip4; tuple.dip.u6_addr32[2] = bpf_htonl(0x0000ffff); tuple.dip.u6_addr32[3] = skops->remote_ip4; } else if (skops->family == AF_INET6) { tuple.sip.u6_addr32[3] = skops->local_ip6[3]; tuple.sip.u6_addr32[2] = skops->local_ip6[2]; tuple.sip.u6_addr32[1] = skops->local_ip6[1]; tuple.sip.u6_addr32[0] = skops->local_ip6[0]; tuple.dip.u6_addr32[3] = skops->remote_ip6[3]; tuple.dip.u6_addr32[2] = skops->remote_ip6[2]; tuple.dip.u6_addr32[1] = skops->remote_ip6[1]; tuple.dip.u6_addr32[0] = skops->remote_ip6[0]; } else { return 0; } switch (skops->op) { case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: // dae sockets { struct tuples_key rev_tuple = {}; copy_reversed_tuples(&tuple, &rev_tuple); struct routing_result *routing_result; routing_result = bpf_map_lookup_elem(&routing_tuples_map, &rev_tuple); if (!routing_result || !routing_result->pid) break; if (!bpf_sock_hash_update(skops, &fast_sock, &tuple, BPF_ANY)) bpf_printk("fast_sock added: %pI4:%lu -> %pI4:%lu", &tuple.sip.u6_addr32[3], bpf_ntohs(tuple.sport), &tuple.dip.u6_addr32[3], bpf_ntohs(tuple.dport)); break; } case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: // local client sockets { struct routing_result *routing_result; routing_result = bpf_map_lookup_elem(&routing_tuples_map, &tuple); if (!routing_result || !routing_result->pid) break; if (!bpf_sock_hash_update(skops, &fast_sock, &tuple, BPF_ANY)) bpf_printk("fast_sock added: %pI4:%lu -> %pI4:%lu", &tuple.sip.u6_addr32[3], bpf_ntohs(tuple.sport), &tuple.dip.u6_addr32[3], bpf_ntohs(tuple.dport)); break; } default: break; } return 0; } SEC("sk_msg/fast_redirect") int sk_msg_fast_redirect(struct sk_msg_md *msg) { struct tuples_key rev_tuple = {}; rev_tuple.l4proto = IPPROTO_TCP; rev_tuple.sport = msg->remote_port >> 16; rev_tuple.dport = bpf_htonl(msg->local_port) >> 16; if (msg->family == AF_INET) { rev_tuple.sip.u6_addr32[2] = bpf_htonl(0x0000ffff); rev_tuple.sip.u6_addr32[3] = msg->remote_ip4; rev_tuple.dip.u6_addr32[2] = bpf_htonl(0x0000ffff); rev_tuple.dip.u6_addr32[3] = msg->local_ip4; } else if (msg->family == AF_INET6) { rev_tuple.sip.u6_addr32[3] = msg->remote_ip6[3]; rev_tuple.sip.u6_addr32[2] = msg->remote_ip6[2]; rev_tuple.sip.u6_addr32[1] = msg->remote_ip6[1]; rev_tuple.sip.u6_addr32[0] = msg->remote_ip6[0]; rev_tuple.dip.u6_addr32[3] = msg->local_ip6[3]; rev_tuple.dip.u6_addr32[2] = msg->local_ip6[2]; rev_tuple.dip.u6_addr32[1] = msg->local_ip6[1]; rev_tuple.dip.u6_addr32[0] = msg->local_ip6[0]; } else { return SK_PASS; } if (bpf_msg_redirect_hash(msg, &fast_sock, &rev_tuple, BPF_F_INGRESS) == SK_PASS) bpf_printk("tcp fast redirect: %pI4:%lu -> %pI4:%lu", &rev_tuple.sip.u6_addr32[3], bpf_ntohs(rev_tuple.sport), &rev_tuple.dip.u6_addr32[3], bpf_ntohs(rev_tuple.dport)); return SK_PASS; } SEC("license") const char __license[] = "Dual BSD/GPL";