This is my first dive into the kernel network stack, so corrections are welcome if you spot mistakes (email); I will note them in the article.


TProxy (Transparent Proxy) is a transparent proxying mechanism supported by the kernel, introduced in Linux 2.6.28. Unlike NAT, which redirects packets by rewriting their destination address, TProxy only replaces the socket originally held by the packet's skb; the packet headers are left unmodified.

Terminology: TProxy is the general name of the feature, while TPROXY is the name of an iptables extension.

IP_TRANSPARENT

The IP_TRANSPARENT option allows a socket to treat arbitrary non-local addresses as local: it can bind to a non-local address and send and receive data while masquerading as that address.

int opt = 1;
setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt));

For example, a gateway (192.168.0.1 / 123.x.x.94) acting as a transparent proxy hijacks a connection between a client (192.168.0.200) and a remote server (157.x.x.149): it connects to the remote server in place of the client, and connects to the client while masquerading as the remote server:

$ netstat -atunp
Proto Recv-Q Send-Q Local Address           Foreign Address            State       PID/Program name
tcp        0      0 123.x.x.94:37338        157.x.x.149:443            ESTABLISHED 2904/proxy
tcp        0      0 ::ffff:157.x.x.149:443  ::ffff:192.168.0.200:56418 ESTABLISHED 2904/proxy
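
For illustration, here is a minimal sketch of the client-facing half of such a proxy. It is my own example (not the proxy from the netstat output above), and it uses the documentation address 203.0.113.149 as a stand-in for the remote server's address. With IP_TRANSPARENT set (and CAP_NET_ADMIN), bind() to a non-local address succeeds instead of failing with EADDRNOTAVAIL:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int main(void) {
  int fd = socket(AF_INET, SOCK_STREAM, 0);
  int opt = 1;

  // Without IP_TRANSPARENT the bind() below fails with EADDRNOTAVAIL.
  if (setsockopt(fd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
    perror("IP_TRANSPARENT");
    return 1;
  }

  // Bind to the remote server's address/port even though it is not local,
  // so traffic sent to the client appears to come from the real server.
  struct sockaddr_in spoof;
  memset(&spoof, 0, sizeof(spoof));
  spoof.sin_family = AF_INET;
  spoof.sin_port = htons(443);
  inet_pton(AF_INET, "203.0.113.149", &spoof.sin_addr);  // stand-in address

  if (bind(fd, (struct sockaddr *)&spoof, sizeof(spoof)) < 0) {
    perror("bind");
    return 1;
  }
  printf("bound to a non-local address\n");
  // connect() back to the client would follow here.
  return 0;
}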

Inbound redirection

Why replace the socket

When the kernel network stack receives a packet, it uses the packet's 5-tuple to find the best-matching socket in the corresponding protocol's hash table, then places the packet into that socket's receive queue. Taking UDP as an example:

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/udp.c#L2405
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	// ...
	sk = skb_steal_sock(skb, &refcounted);
	if (sk) {
		// ...
		ret = udp_unicast_rcv_skb(sk, skb, uh);

static inline struct sock *
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
	if (skb->sk) {
		struct sock *sk = skb->sk;
		// ...
		return sk;

static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
			       struct udphdr *uh)
{
	// ...
	ret = udp_queue_rcv_skb(sk, skb);

Netfilter hooks run before the protocol stack's own processing, so changing skb->sk inside netfilter decides which socket's receive queue the packet ultimately lands in.
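
As a rough illustration of where this happens (my own hypothetical sketch, not the TPROXY source): a module can register a hook at NF_INET_PRE_ROUTING, which runs before the IP layer hands the packet to TCP/UDP, and anything assigned to skb->sk there is later picked up by skb_steal_sock():

#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>

// Hypothetical demo hook: runs at PRE_ROUTING, before the protocol stack.
// xt_TPROXY does its socket lookup at this point and assigns the result
// to skb->sk.
static unsigned int demo_hook(void *priv, struct sk_buff *skb,
			      const struct nf_hook_state *state)
{
	// skb->sk is normally still NULL for inbound packets here.
	return NF_ACCEPT;
}

static struct nf_hook_ops demo_ops = {
	.hook     = demo_hook,
	.pf       = NFPROTO_IPV4,
	.hooknum  = NF_INET_PRE_ROUTING,
	.priority = NF_IP_PRI_MANGLE,   /* same table TPROXY is used from */
};

static int __init demo_init(void)
{
	return nf_register_net_hook(&init_net, &demo_ops);
}

static void __exit demo_exit(void)
{
	nf_unregister_net_hook(&init_net, &demo_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");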

Kernel implementation

The walkthrough below is based on kernel v6.1.34 and uses the iptables TPROXY module as the example; the nftables implementation is essentially the same.

Core logic

The main processing flow is tproxy_tg4() in net/netfilter/xt_TPROXY.c.

First, extract the transport header from the skb:

static unsigned int
tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
	   u_int32_t mark_mask, u_int32_t mark_value)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct udphdr _hdr, *hp;
	struct sock *sk;

	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
	if (hp == NULL)
		return NF_DROP;

It then looks for a socket (sk in the code) to replace the skb's original socket.

If a packet with the same 4-tuple has already been redirected, the proxy should already have an established connection with the client, and the current packet should be redirected into that connection as well:

	/* check if there's an ongoing connection on the packet
	 * addresses, this happens if the redirect already happened
	 * and the current packet belongs to an already established
	 * connection */
	sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
				   iph->saddr, iph->daddr,
				   hp->source, hp->dest,
				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);

Next, set the default redirect destination; packets that have not been handled before are all redirected there. The address/port specified in the rule take priority; otherwise the primary address of the device that received the packet is used:

	laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
	if (!lport)
		lport = hp->dest;

__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
	const struct in_ifaddr *ifa;
	struct in_device *indev;
	__be32 laddr;

	if (user_laddr)
		return user_laddr;

	laddr = 0;
	indev = __in_dev_get_rcu(skb->dev);

	in_dev_for_each_ifa_rcu(ifa, indev) {
		if (ifa->ifa_flags & IFA_F_SECONDARY)
			continue;

		laddr = ifa->ifa_local;
		break;
	}

	return laddr ? laddr : daddr;
}

A SYN is forwarded to the proxy to establish a new connection rather than reusing a connection that is already in TIME_WAIT. My guess is that this makes it easier for the proxy to keep the state of the two legs (client <-> proxy <-> remote) in sync:

	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
	if (sk && sk->sk_state == TCP_TIME_WAIT)
		/* reopening a TIME_WAIT connection needs special handling */
		sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);

/**
 * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
 * @skb:	The skb being processed.
 * @laddr:	IPv4 address to redirect to or zero.
 * @lport:	TCP port to redirect to or zero.
 * @sk:		The TIME_WAIT TCP socket found by the lookup.
 *
 * We have to handle SYN packets arriving to TIME_WAIT sockets
 * differently: instead of reopening the connection we should rather
 * redirect the new connection to the proxy if there's a listener
 * socket present.
 *
 * nf_tproxy_handle_time_wait4() consumes the socket reference passed in.
 *
 * Returns the listener socket if there's one, the TIME_WAIT socket if
 * no such listener is found, or NULL if the TCP header is incomplete.
 */
struct sock *
nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
			 __be32 laddr, __be16 lport, struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr _hdr, *hp;

	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
	if (hp == NULL) {
		inet_twsk_put(inet_twsk(sk));
		return NULL;
	}

	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
		/* SYN to a TIME_WAIT socket, we'd rather redirect it
		 * to a listener socket if there's one */
		struct sock *sk2;

		sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
					    iph->saddr, laddr ? laddr : iph->daddr,
					    hp->source, lport ? lport : hp->dest,
					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
		if (sk2) {
			nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
		}
	}

	return sk;
}

If no established connection matched, fall back to the listening socket at the redirect destination:

	else if (!sk)
		/* no, there's no established connection, check if
		 * there's a listener on the redirected addr/port */
		sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
					   iph->saddr, laddr,
					   hp->source, lport,
					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);

Finally, confirm that the new socket qualifies for transparent proxying and use it to replace the skb's original socket:

	/* NOTE: assign_sock consumes our sk reference */
	if (sk && nf_tproxy_sk_is_transparent(sk)) {
		/* This should be in a separate target, but we don't do multiple
		   targets on the same rule yet */
		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
		nf_tproxy_assign_sock(skb, sk);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/* assign a socket to the skb -- consumes sk */
static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = sock_edemux;
}

Socket matching

nf_tproxy_get_sock_v4() is a thin wrapper around the generic TCP/UDP socket lookup helpers.

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/netfilter/nf_tproxy_ipv4.c#L75
/*
 * This is used when the user wants to intercept a connection matching
 * an explicit iptables rule. In this case the sockets are assumed
 * matching in preference order:
 *
 *   - match: if there's a fully established connection matching the
 *     _packet_ tuple, it is returned, assuming the redirection
 *     already took place and we process a packet belonging to an
 *     established connection
 *
 *   - match: if there's a listening socket matching the redirection
 *     (e.g. on-port & on-ip of the connection), it is returned,
 *     regardless if it was bound to 0.0.0.0 or an explicit
 *     address. The reasoning is that if there's an explicit rule, it
 *     does not really matter if the listener is bound to an interface
 *     or to 0. The user already stated that he wants redirection
 *     (since he added the rule).
 *
 * Please note that there's an overlap between what a TPROXY target
 * and a socket match will match. Normally if you have both rules the
 * "socket" match will be the first one, effectively all packets
 * belonging to established connections going through that one.
 */
struct sock *
nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
		      const u8 protocol,
		      const __be32 saddr, const __be32 daddr,
		      const __be16 sport, const __be16 dport,
		      const struct net_device *in,
		      const enum nf_tproxy_lookup_t lookup_type)
{
	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
	struct sock *sk;
	switch (protocol) {

TCP has a dedicated lookup helper for each of the two states; the only extra work is taking a reference on a listening socket so it is not freed underneath us:

	case IPPROTO_TCP: {
		struct tcphdr _hdr, *hp;

		hp = skb_header_pointer(skb, ip_hdrlen(skb),
					sizeof(struct tcphdr), &_hdr);
		if (hp == NULL)
			return NULL;

		switch (lookup_type) {
		case NF_TPROXY_LOOKUP_LISTENER:
			sk = inet_lookup_listener(net, hinfo, skb,
						  ip_hdrlen(skb) + __tcp_hdrlen(hp),
						  saddr, sport, daddr, dport,
						  in->ifindex, 0);

			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
				sk = NULL;
			/* NOTE: we return listeners even if bound to
			 * 0.0.0.0, those are filtered out in
			 * xt_socket, since xt_TPROXY needs 0 bound
			 * listeners too
			 */
			break;
		case NF_TPROXY_LOOKUP_ESTABLISHED:
			sk = inet_lookup_established(net, hinfo, saddr, sport,
						     daddr, dport, in->ifindex);
			break;
		default:
			BUG();
		}
		break;
		}

UDP needs an additional check on whether the lookup result is usable:

	case IPPROTO_UDP:
		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
				     in->ifindex);
		if (sk) {
			int connected = (sk->sk_state == TCP_ESTABLISHED);
			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);

			/* NOTE: we return listeners even if bound to
			 * 0.0.0.0, those are filtered out in
			 * xt_socket, since xt_TPROXY needs 0 bound
			 * listeners too
			 */
			if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
			      (!connected || wildcard)) ||
			    (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
				sock_put(sk);
				sk = NULL;
			}
		}
		break;

There are two qualifying conditions:

  • connected indicates whether the socket is "connected"
  • wildcard indicates whether the bound address is INADDR_ANY (0.0.0.0)

But I don't understand the !connected || wildcard condition: when connected is true, wildcard must be false (see below), so || wildcard looks redundant.

A UDP socket becomes connected after connect()ing to a target. If it was not previously bound to a concrete IP (one that incoming packets could carry in their destination address field), connect() uses the kernel's route lookup to pick a local address as both the source address and the local bind address, and stores it in inet_rcv_saddr as well. Only a disconnect resets inet_rcv_saddr back to INADDR_ANY:

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/datagram.c#L64
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	//...

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;	/* Update source address */
	if (!inet->inet_rcv_saddr) {
		inet->inet_rcv_saddr = fl4->saddr;
		if (sk->sk_prot->rehash)
			sk->sk_prot->rehash(sk);
	}

	// ...

	sk->sk_state = TCP_ESTABLISHED;

	// ...
}

int __udp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	/*
	 *	1003.1g - break association.
	 */

	sk->sk_state = TCP_CLOSE;

	// ...

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
		inet_reset_saddr(sk);

	// ...
}

static __inline__ void inet_reset_saddr(struct sock *sk)
{
	inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;

So a connected UDP socket's inet_rcv_saddr is always a concrete IP and can never be INADDR_ANY.
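
This is easy to check from user space. Below is a minimal sketch of my own (assuming the host has a default route; 192.0.2.1 is just a placeholder destination): an unbound UDP socket reports 0.0.0.0 from getsockname() until connect(), after which the kernel has filled in a concrete local address:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void print_local(int fd, const char *when) {
  struct sockaddr_in local;
  socklen_t len = sizeof(local);
  char buf[INET_ADDRSTRLEN];

  getsockname(fd, (struct sockaddr *)&local, &len);
  printf("%s: %s:%d\n", when,
         inet_ntop(AF_INET, &local.sin_addr, buf, sizeof(buf)),
         ntohs(local.sin_port));
}

int main(void) {
  int fd = socket(AF_INET, SOCK_DGRAM, 0);

  print_local(fd, "before connect");  // 0.0.0.0:0

  // UDP connect() sends nothing; it records the peer and lets the route
  // lookup choose the local (source) address, as in __ip4_datagram_connect().
  struct sockaddr_in dst;
  memset(&dst, 0, sizeof(dst));
  dst.sin_family = AF_INET;
  dst.sin_port = htons(53);
  inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);  // placeholder destination
  if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
    perror("connect");
    return 1;
  }

  print_local(fd, "after connect");   // a concrete local IP and port

  close(fd);
  return 0;
}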

The commit that added these conditions mentions that nf_tproxy_get_sock_v4() is also used by the iptables socket extension, so my guess is that this may be a historical leftover.

Usage

Taking the iptables TPROXY extension as an example:

  1. --on-port/--on-ip specify the redirect destination
  2. Since the packet's destination address is not modified, the routing decision after PREROUTING would still send the packet to the FORWARD chain because its destination is not a local address. Policy routing is therefore needed to steer the packet into the INPUT chain
ip rule add fwmark 0x233 table 100
ip route add local default dev lo table 100

iptables -t mangle -A PREROUTING -p udp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233

These rules replace the packet's original socket with the socket bound to :10000 and mark the packet with 0x233. Policy routing makes every packet marked 0x233 use routing table 100, whose default route is a local route via the lo loopback device. Packets routed via the loopback device are treated as destined for the local host, which keeps them from being forwarded.

Using -m socket to improve performance

The comment on nf_tproxy_get_sock_v4() mentions this:

/*
 * Please note that there's an overlap between what a TPROXY target
 * and a socket match will match. Normally if you have both rules the
 * "socket" match will be the first one, effectively all packets
 * belonging to established connections going through that one.
 */

Once a connection has been established for a packet that TProxy redirected, the network stack has a mapping from the packet's original 5-tuple to the socket. For later packets with the same 5-tuple, the socket found by the stack's normal lookup, which is the same one TPROXY finds in its first lookup by the packet's 5-tuple, sk = nf_tproxy_get_sock_v4(..., NF_TPROXY_LOOKUP_ESTABLISHED), is already the "right" (already redirected) one, so the rest of the socket replacement is unnecessary. An iptables socket rule can split off this traffic early and improve performance.

It is uncommon for a UDP proxy to connect() back to the client, so only TCP is covered here:

iptables -t mangle -N tproxy_divert
iptables -t mangle -A tproxy_divert -j MARK --set-mark 0x233
iptables -t mangle -A tproxy_divert -j ACCEPT

iptables -t mangle -A PREROUTING -p tcp -m socket -j tproxy_divert
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-port 10000 --on-ip 127.0.0.1 --tproxy-mark 0x233

Obtaining the original destination address

TCP

Calling getsockname() on the accepted client socket returns its "local" address, which is the packet's original destination address:

client_fd = accept(server_fd, (struct sockaddr *)&client_addr, &addr_len);

getsockname(client_fd, (struct sockaddr *)&orig_dst, &addr_len);
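
Below is a more complete sketch of the accept side, written as an illustration rather than taken from any particular proxy. It assumes the iptables rules above redirect TCP to port 10000; the listener sets IP_TRANSPARENT and reads each hijacked connection's original destination with getsockname():

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void) {
  int server_fd = socket(AF_INET, SOCK_STREAM, 0);
  if (server_fd < 0) { perror("socket"); exit(EXIT_FAILURE); }

  int opt = 1;
  // IP_TRANSPARENT lets the listener accept connections whose destination
  // address is not a local address.
  if (setsockopt(server_fd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
    perror("IP_TRANSPARENT");
    exit(EXIT_FAILURE);
  }

  struct sockaddr_in addr;
  memset(&addr, 0, sizeof(addr));
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_ANY);
  addr.sin_port = htons(10000);  // matches --on-port 10000 in the rules above
  if (bind(server_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
    perror("bind");
    exit(EXIT_FAILURE);
  }
  listen(server_fd, 128);

  for (;;) {
    struct sockaddr_in client_addr, orig_dst;
    socklen_t len = sizeof(client_addr);
    int client_fd = accept(server_fd, (struct sockaddr *)&client_addr, &len);
    if (client_fd < 0) { perror("accept"); continue; }

    // The "local" address of the accepted socket is the original destination.
    len = sizeof(orig_dst);
    getsockname(client_fd, (struct sockaddr *)&orig_dst, &len);

    char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
    printf("%s:%d was going to %s:%d\n",
           inet_ntop(AF_INET, &client_addr.sin_addr, src, sizeof(src)),
           ntohs(client_addr.sin_port),
           inet_ntop(AF_INET, &orig_dst.sin_addr, dst, sizeof(dst)),
           ntohs(orig_dst.sin_port));
    close(client_fd);
  }
}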

UDP

  1. Set the socket option with setsockopt(..., SOL_IP, IP_RECVORIGDSTADDR, ...) so that recvmsg() provides IP_ORIGDSTADDR ancillary data, i.e. the packet's destination address. Because TProxy does not modify the original packet, this ancillary data is taken straight from the IP header:
// /net/ipv4/ip_sockglue.c
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
	struct sockaddr_in sin;
	const struct iphdr *iph = ip_hdr(skb);
	__be16 *ports = (__be16 *)skb_transport_header(skb);

	if (skb_transport_offset(skb) + 4 > (int)skb->len)
		return;

	/* All current transport protocols have the port numbers in the
	 * first four bytes of the transport header and this function is
	 * written with this assumption in mind.
	 */

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = iph->daddr;
	sin.sin_port = ports[1];
	memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
  2. Read the packet and the ancillary data with recvmsg()
  3. The ancillary data with level SOL_IP and type IP_ORIGDSTADDR is the original destination address

Complete example:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_BUF_SIZE 1024
#define SRC_ADDR INADDR_ANY
#define SRC_PORT 9999

int main() {
  int sockfd;
  struct sockaddr_in bind_addr, client_addr;
  char buffer[MAX_BUF_SIZE];

  if ((sockfd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
    perror("socket");
    exit(EXIT_FAILURE);
  }

  int opt = 1;
  if (setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
    perror("IP_TRANSPARENT");
    exit(EXIT_FAILURE);
  }

  // bind
  memset(&bind_addr, 0, sizeof(bind_addr));
  bind_addr.sin_family = AF_INET;
  bind_addr.sin_addr.s_addr = htonl(SRC_ADDR);
  bind_addr.sin_port = htons(SRC_PORT);
  if (bind(sockfd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)) < 0) {
    perror("bind");
    exit(EXIT_FAILURE);
  }

  // recvmsg
  if (setsockopt(sockfd, SOL_IP, IP_RECVORIGDSTADDR, &opt, sizeof(opt)) < 0) {
    perror("IP_RECVORIGDSTADDR");
    exit(EXIT_FAILURE);
  }
  while (1) {
    memset(buffer, 0, sizeof(buffer));
    struct msghdr msgh = {0};
    struct iovec iov[1];
    iov[0].iov_base = buffer;
    iov[0].iov_len = sizeof(buffer);
    msgh.msg_iov = iov;
    msgh.msg_iovlen = 1;
    msgh.msg_name = &client_addr;
    msgh.msg_namelen = sizeof(client_addr);
    // The control buffer must be large enough for a struct sockaddr_in cmsg,
    // otherwise the IP_ORIGDSTADDR data gets truncated (MSG_CTRUNC).
    char cmsgbuf[CMSG_SPACE(sizeof(struct sockaddr_in))];
    msgh.msg_control = cmsgbuf;
    msgh.msg_controllen = sizeof(cmsgbuf);
    if (recvmsg(sockfd, &msgh, 0) < 0) {
      perror("recvmsg");
      continue;
    }

    struct cmsghdr *cmsg;
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
      if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_ORIGDSTADDR) {
        struct sockaddr_in *addr = (struct sockaddr_in *)CMSG_DATA(cmsg);
        printf("Original DST ADDR: %s\n", inet_ntoa(addr->sin_addr));
        break;
      }
    }
    printf("Data: %s\n", buffer);
  }

  close(sockfd);

  return 0;
}

References

Examples: