这是我第一次探索内核网络栈,如果有错误欢迎通过邮箱指正,之后我会在文中标注。
TProxy(Transparent Proxy)是内核支持的一种透明代理方式,于 Linux 2.6.28 引入。不同于 NAT 修改数据包目的地址实现重定向,TProxy 仅替换数据包的 skb 原本持有的 socket,不需要修改数据包标头。
名词区分:TProxy 是功能的统称,TPROXY 是一个 iptables 扩展的名称。
IP_TRANSPARENT
IP_TRANSPARENT
选项允许 socket 将任意非本机地址视为本机地址,进而可以绑定在非本机地址,伪装为非本机地址发送、接收数据。
1int opt = 1;
2setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt));
例如,网关(192.168.0.1
/ 123.x.x.94
)作为透明代理,劫持了客户端(192.168.0.200
)与远端(157.x.x.149
)的连接。代替客户端与远端连接,又伪装成远端与客户端连接:
1$ netstat -atunp
2Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
3tcp 0 0 123.x.x.94:37338 157.x.x.149:443 ESTABLISHED 2904/proxy
4tcp 0 0 ::ffff:157.x.x.149:443 ::ffff:192.168.0.200:56418 ESTABLISHED 2904/proxy
入站重定向
为什么替换 socket
内核网络栈收到一个数据包时,会根据数据包四元组从相应协议的哈希表中找出匹配度最高的 socket,然后将数据包放入 socket 的接收队列。以 UDP 为例:
1// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/udp.c#L2405
2int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
3 int proto)
4{
5 // ...
6 sk = skb_steal_sock(skb, &refcounted);
7 if (sk) {
8 // ...
9 ret = udp_unicast_rcv_skb(sk, skb, uh);
10
11
12static inline struct sock *
13skb_steal_sock(struct sk_buff *skb, bool *refcounted)
14{
15 if (skb->sk) {
16 struct sock *sk = skb->sk;
17 // ...
18 return sk;
19
20
21static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
22 struct udphdr *uh)
23{
24 // ...
25 ret = udp_queue_rcv_skb(sk, skb);
所以利用 netfilter hook 的执行时机早于协议接收方法,提前修改数据包的 skb 持有的 socket,就决定了此后数据包最终会被放入哪个 socket 的接收队列。
内核实现
基于内核 v6.1.34,以 iptables TPROXY 模块的实现为例。nftables 中的实现基本相同。
核心逻辑
主要处理流程在 net/netfilter/xt_TPROXY.c
的 tproxy_tg4()
。
从 skb 中提取标头:
1static unsigned int
2tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
3 u_int32_t mark_mask, u_int32_t mark_value)
4{
5 const struct iphdr *iph = ip_hdr(skb);
6 struct udphdr _hdr, *hp;
7 struct sock *sk;
8
9 hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
10 if (hp == NULL)
11 return NF_DROP;
然后开始寻找一个 socket(代码中的 sk
) 用来替换数据包 skb 的原 socket。
如果之前对相同四元组的数据包做过重定向,则代理程序应该已经与客户端建立了连接,当前数据包也应该重定向到该连接:
1 /* check if there's an ongoing connection on the packet
2 * addresses, this happens if the redirect already happened
3 * and the current packet belongs to an already established
4 * connection */
5 sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
6 iph->saddr, iph->daddr,
7 hp->source, hp->dest,
8 skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
设置默认的重定向目的地,没处理过的数据包都应该重定向到此处。优先使用规则中的指定,否则使用接收数据包的网络设备主地址:
1 laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
2 if (!lport)
3 lport = hp->dest;
1__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
2{
3 const struct in_ifaddr *ifa;
4 struct in_device *indev;
5 __be32 laddr;
6
7 if (user_laddr)
8 return user_laddr;
9
10 laddr = 0;
11 indev = __in_dev_get_rcu(skb->dev);
12
13 in_dev_for_each_ifa_rcu(ifa, indev) {
14 if (ifa->ifa_flags & IFA_F_SECONDARY)
15 continue;
16
17 laddr = ifa->ifa_local;
18 break;
19 }
20
21 return laddr ? laddr : daddr;
22}
如果连接不可用后客户端想重新建立连接,则放弃当前的旧连接,让重定向目的地 socket 处理握手请求:
1 /* UDP has no TCP_TIME_WAIT state, so we never enter here */
2 if (sk && sk->sk_state == TCP_TIME_WAIT)
3 /* reopening a TIME_WAIT connection needs special handling */
4 sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
1/**
2 * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
3 * @skb: The skb being processed.
4 * @laddr: IPv4 address to redirect to or zero.
5 * @lport: TCP port to redirect to or zero.
6 * @sk: The TIME_WAIT TCP socket found by the lookup.
7 *
8 * We have to handle SYN packets arriving to TIME_WAIT sockets
9 * differently: instead of reopening the connection we should rather
10 * redirect the new connection to the proxy if there's a listener
11 * socket present.
12 *
13 * nf_tproxy_handle_time_wait4() consumes the socket reference passed in.
14 *
15 * Returns the listener socket if there's one, the TIME_WAIT socket if
16 * no such listener is found, or NULL if the TCP header is incomplete.
17 */
18struct sock *
19nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
20 __be32 laddr, __be16 lport, struct sock *sk)
21{
22 const struct iphdr *iph = ip_hdr(skb);
23 struct tcphdr _hdr, *hp;
24
25 hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
26 if (hp == NULL) {
27 inet_twsk_put(inet_twsk(sk));
28 return NULL;
29 }
30
31 if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
32 /* SYN to a TIME_WAIT socket, we'd rather redirect it
33 * to a listener socket if there's one */
34 struct sock *sk2;
35
36 sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
37 iph->saddr, laddr ? laddr : iph->daddr,
38 hp->source, lport ? lport : hp->dest,
39 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
40 if (sk2) {
41 nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
42 sk = sk2;
43 }
44 }
45
46 return sk;
47}
如果没有匹配到已建立的连接,使用监听状态下的重定向目的地 socket:
1 else if (!sk)
2 /* no, there's no established connection, check if
3 * there's a listener on the redirected addr/port */
4 sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
5 iph->saddr, laddr,
6 hp->source, lport,
7 skb->dev, NF_TPROXY_LOOKUP_LISTENER);
最后确认一下新 socket 符合透明代理条件,用其替换数据包 skb 的原 socket:
1 /* NOTE: assign_sock consumes our sk reference */
2 if (sk && nf_tproxy_sk_is_transparent(sk)) {
3 /* This should be in a separate target, but we don't do multiple
4 targets on the same rule yet */
5 skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
6 nf_tproxy_assign_sock(skb, sk);
7 return NF_ACCEPT;
8 }
9
10 return NF_DROP;
11}
1/* assign a socket to the skb -- consumes sk */
2static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
3{
4 skb_orphan(skb);
5 skb->sk = sk;
6 skb->destructor = sock_edemux;
7}
socket 匹配
nf_tproxy_get_sock_v4()
是对 TCP/UDP socket 通用匹配方法的简单封装。
1// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/netfilter/nf_tproxy_ipv4.c#L75
2/*
3 * This is used when the user wants to intercept a connection matching
4 * an explicit iptables rule. In this case the sockets are assumed
5 * matching in preference order:
6 *
7 * - match: if there's a fully established connection matching the
8 * _packet_ tuple, it is returned, assuming the redirection
9 * already took place and we process a packet belonging to an
10 * established connection
11 *
12 * - match: if there's a listening socket matching the redirection
13 * (e.g. on-port & on-ip of the connection), it is returned,
14 * regardless if it was bound to 0.0.0.0 or an explicit
15 * address. The reasoning is that if there's an explicit rule, it
16 * does not really matter if the listener is bound to an interface
17 * or to 0. The user already stated that he wants redirection
18 * (since he added the rule).
19 *
20 * Please note that there's an overlap between what a TPROXY target
21 * and a socket match will match. Normally if you have both rules the
22 * "socket" match will be the first one, effectively all packets
23 * belonging to established connections going through that one.
24 */
25struct sock *
26nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
27 const u8 protocol,
28 const __be32 saddr, const __be32 daddr,
29 const __be16 sport, const __be16 dport,
30 const struct net_device *in,
31 const enum nf_tproxy_lookup_t lookup_type)
32{
33 struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
34 struct sock *sk;
35 switch (protocol) {
TCP 两种状态都有对应的匹配方法,只需要额外为监听状态的 socket 增加引用计数避免被清理:
1 case IPPROTO_TCP: {
2 struct tcphdr _hdr, *hp;
3
4 hp = skb_header_pointer(skb, ip_hdrlen(skb),
5 sizeof(struct tcphdr), &_hdr);
6 if (hp == NULL)
7 return NULL;
8
9 switch (lookup_type) {
10 case NF_TPROXY_LOOKUP_LISTENER:
11 sk = inet_lookup_listener(net, hinfo, skb,
12 ip_hdrlen(skb) + __tcp_hdrlen(hp),
13 saddr, sport, daddr, dport,
14 in->ifindex, 0);
15
16 if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
17 sk = NULL;
18 /* NOTE: we return listeners even if bound to
19 * 0.0.0.0, those are filtered out in
20 * xt_socket, since xt_TPROXY needs 0 bound
21 * listeners too
22 */
23 break;
24 case NF_TPROXY_LOOKUP_ESTABLISHED:
25 sk = inet_lookup_established(net, hinfo, saddr, sport,
26 daddr, dport, in->ifindex);
27 break;
28 default:
29 BUG();
30 }
31 break;
32 }
UDP 需要额外判断匹配结果是否可用:
1 case IPPROTO_UDP:
2 sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
3 in->ifindex);
4 if (sk) {
5 int connected = (sk->sk_state == TCP_ESTABLISHED);
6 int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
7
8 /* NOTE: we return listeners even if bound to
9 * 0.0.0.0, those are filtered out in
10 * xt_socket, since xt_TPROXY needs 0 bound
11 * listeners too
12 */
13 if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
14 (!connected || wildcard)) ||
15 (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
16 sock_put(sk);
17 sk = NULL;
18 }
19 }
20 break;
有两个限定条件:
- connected 表示是否已“连接”
- wildcard 表示绑定地址是否是 INADDR_ANY(
0.0.0.0
)
但不理解 !connected || wildcard
这个判定条件,因为 connected 为真时 wildcard 一定为假,|| wildcard
是多余的。
一个 UDP socket connect()
一个目标后变为已连接状态。如果之前没有绑定在一个能写入 IP 数据包目的地址字段的准确 IP,则 connect()
中会由系统静态路由选择一个本机地址做源地址和本地绑定地址,同时设置给 inet_rcv_saddr
字段。只有 disconnect 才会再次将 inet_rcv_saddr
字段设为 INADDR_ANY:
1// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/datagram.c#L64
2int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
3{
4 //...
5
6 if (!inet->inet_saddr)
7 inet->inet_saddr = fl4->saddr; /* Update source address */
8 if (!inet->inet_rcv_saddr) {
9 inet->inet_rcv_saddr = fl4->saddr;
10 if (sk->sk_prot->rehash)
11 sk->sk_prot->rehash(sk);
12 }
13
14 // ...
15
16 sk->sk_state = TCP_ESTABLISHED;
17
18 // ...
19}
20
21int __udp_disconnect(struct sock *sk, int flags)
22{
23 struct inet_sock *inet = inet_sk(sk);
24 /*
25 * 1003.1g - break association.
26 */
27
28 sk->sk_state = TCP_CLOSE;
29
30 // ...
31
32 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
33 inet_reset_saddr(sk);
34
35 // ...
36}
37
38static __inline__ void inet_reset_saddr(struct sock *sk)
39{
40 inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
所以,已连接状态的 UDP socket 的 inet_rcv_saddr
肯定是一个准确的 IP,不可能是 INADDR_ANY 。
添加这些限制条件的 commit 中提到 nf_tproxy_get_sock_v4()
也会被 iptables socket 扩展使用。猜测这里可能是历史遗留问题?
使用方式
以 iptables TPROXY 扩展 为例:
- 由
--on-port
/--on-ip
指定重定向目的地 - 由于没有修改数据包目的地址,在 PREROUTING 之后的路由选择仍会因为目的地址不是本机而走到 FORWARD 链。所以需要策略路由来引导数据包进入 INPUT 链
1ip rule add fwmark 0x233 table 100
2ip route add local default dev lo table 100
3
4iptables -t mangle -A PREROUTING -p udp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233
5iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233
用绑定在 :10000
的 socket 替换数据包原 socket,同时打上 0x233 标记。设置策略路由,让所有带有 0x233 标记的数据包使用 100 号路由表。在 100 号表中设定默认路由走 lo 本地回环设备。而从本地回环设备发出的数据包都会被视作发向本机,也就避免了被转发出去。
使用 -m socket
优化性能
nf_tproxy_get_sock_v4()
的注释中提到了这一点:
1/*
2 * Please note that there's an overlap between what a TPROXY target
3 * and a socket match will match. Normally if you have both rules the
4 * "socket" match will be the first one, effectively all packets
5 * belonging to established connections going through that one.
6*/
被 TProxy 重定向过的数据包建立连接后,网络栈中有了数据包原始五元组与 socket 的映射关系。之后相同五元组的数据包在网络栈的常规处理中匹配到的 socket,也即 TPROXY 中第一次用数据包五元组匹配的 sk = nf_tproxy_get_sock_v4(...., NF_TPROXY_LOOKUP_ESTABLISHED)
,就是“正确”的(或者说已重定向过的),没必要进行后续的 socket 替换。所以用 iptables socket 规则分流出这一部分,提升性能。
UDP 比较少见代理程序会 connect()
客户端,所以只以 TCP 为例:
1iptables -t mangle -N tproxy_divert
2iptables -t mangle -A tproxy_divert -j MARK --set-mark 0x233
3iptables -t mangle -A tproxy_divert -j ACCEPT
4
5iptables -t mangle -A PREROUTING -p tcp -m socket -j tproxy_divert
6iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-port 10000 --on-ip 127.0.0.1 --tproxy-mark 0x233
获取原始目标地址
TCP
用 getsockname()
获取客户端 socket 的“本地”地址,即为数据包的原始目标地址:
1client_fd = accept(server_fd, (struct sockaddr*)&client_addr, &addr_len);
2
3getsockname(client_fd, (struct sockaddr*) orig_dst, &addrlen)
UDP
- 使用
setsockopt(..., SOL_IP, IP_RECVORIGDSTADDR, ...)
设置 socket 选项让recvmsg()
提供 IP_ORIGDSTADDR 辅助信息,即数据包目的地址。得益于 TProxy 没有修改原始数据包,该辅助信息是从 IP 标头中获取的:
1// /net/ipv4/ip_sockglue.c
2static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
3{
4 struct sockaddr_in sin;
5 const struct iphdr *iph = ip_hdr(skb);
6 __be16 *ports = (__be16 *)skb_transport_header(skb);
7
8 if (skb_transport_offset(skb) + 4 > (int)skb->len)
9 return;
10
11 /* All current transport protocols have the port numbers in the
12 * first four bytes of the transport header and this function is
13 * written with this assumption in mind.
14 */
15
16 sin.sin_family = AF_INET;
17 sin.sin_addr.s_addr = iph->daddr;
18 sin.sin_port = ports[1];
19 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
20
21 put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
22}
- 使用
recvmsg()
读取数据包和辅助信息 - 辅助信息中级别为 SOL_IP ,类型为 IP_ORIGDSTADDR 的数据就是原始目标地址
完整示例:
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_BUF_SIZE 1024
#define SRC_ADDR INADDR_ANY
#define SRC_PORT 9999

/*
 * Minimal transparent UDP receiver: binds a socket with IP_TRANSPARENT,
 * enables IP_RECVORIGDSTADDR, and for every datagram prints the original
 * destination address recovered from the IP_ORIGDSTADDR control message
 * (possible because TProxy does not rewrite the packet headers).
 */
int main() {
  int sockfd;
  struct sockaddr_in bind_addr, client_addr;
  char buffer[MAX_BUF_SIZE];

  if ((sockfd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
    perror("socket");
    exit(EXIT_FAILURE);
  }

  /* Allow binding to / receiving for non-local addresses (needs CAP_NET_ADMIN). */
  int opt = 1;
  if (setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
    perror("IP_TRANSPARENT");
    exit(EXIT_FAILURE);
  }

  // bind
  memset(&bind_addr, 0, sizeof(bind_addr));
  bind_addr.sin_family = AF_INET;
  bind_addr.sin_addr.s_addr = htonl(SRC_ADDR);
  bind_addr.sin_port = htons(SRC_PORT);
  if (bind(sockfd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)) < 0) {
    perror("bind");
    exit(EXIT_FAILURE);
  }

  /* Ask the kernel to attach the original destination address to each
   * received datagram as ancillary data (delivered as IP_ORIGDSTADDR). */
  if (setsockopt(sockfd, SOL_IP, IP_RECVORIGDSTADDR, &opt, sizeof(opt)) < 0) {
    perror("IP_RECVORIGDSTADDR");
    exit(EXIT_FAILURE);
  }

  while (1) {
    struct msghdr msgh = {0};
    struct iovec iov[1];
    iov[0].iov_base = buffer;
    iov[0].iov_len = sizeof(buffer) - 1; /* reserve one byte for '\0' */
    msgh.msg_iov = iov;
    msgh.msg_iovlen = 1;
    msgh.msg_name = &client_addr;
    msgh.msg_namelen = sizeof(client_addr);
    /* BUG FIX: the IP_ORIGDSTADDR ancillary payload is a struct
     * sockaddr_in, not an int. A buffer sized CMSG_SPACE(sizeof(int)) is
     * too small: the control message gets truncated (MSG_CTRUNC) and the
     * original destination address is silently lost. */
    char cmsgbuf[CMSG_SPACE(sizeof(struct sockaddr_in))];
    msgh.msg_control = cmsgbuf;
    msgh.msg_controllen = sizeof(cmsgbuf);

    ssize_t n = recvmsg(sockfd, &msgh, 0);
    if (n < 0) {
      perror("recvmsg");
      continue;
    }
    /* NUL-terminate explicitly: a datagram of exactly iov_len bytes would
     * otherwise leave buffer unterminated for the %s below. */
    buffer[n] = '\0';

    struct cmsghdr *cmsg;
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
      if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_ORIGDSTADDR) {
        struct sockaddr_in *addr = (struct sockaddr_in *)CMSG_DATA(cmsg);
        printf("Original DST ADDR: %s\n", inet_ntoa(addr->sin_addr));
        break;
      }
    }
    printf("Data: %s\n", buffer);
  }

  close(sockfd);

  return 0;
}
参考
- 官方文档
- Linux transparent proxy support
- TProxy 探秘
- Abusing Linux’s firewall: the hack that allowed us to build Spectrum
- iptables-extensions 中 socket 模块是个啥?
- 从 ss-redir 的实现到 Linux NAT
- Linux 网络栈接收数据(RX):原理及内核实现(2022)
示例: