A Deep Dive into Linux TProxy

This is my first time exploring the kernel network stack; if you find mistakes, corrections are welcome (email), and I will note them in the article.

TProxy (Transparent Proxy) is a transparent proxying mechanism supported by the kernel, introduced in Linux 2.6.28. Unlike NAT-based redirection, which rewrites a packet's destination address, TProxy only replaces the socket originally held by the packet's skb and never modifies the packet headers.

A note on naming: TProxy refers to the feature as a whole, while TPROXY is the name of an iptables target extension.

IP_TRANSPARENT

The IP_TRANSPARENT socket option allows a socket to treat any non-local address as if it were local: the socket can bind to a non-local address, then send and receive data while masquerading as that address. Setting the option requires the CAP_NET_ADMIN capability.

int opt = 1;
setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt));

For example, a gateway (192.168.0.1 / 123.x.x.94) acting as a transparent proxy hijacks the connection between a client (192.168.0.200) and a remote server (157.x.x.149). It connects to the remote server on the client's behalf, and masquerades as the remote server when talking to the client:

$ netstat -atunp
Proto Recv-Q Send-Q Local Address           Foreign Address            State       PID/Program name
tcp        0      0 123.x.x.94:37338        157.x.x.149:443            ESTABLISHED 2904/proxy
tcp        0      0 ::ffff:157.x.x.149:443  ::ffff:192.168.0.200:56418 ESTABLISHED 2904/proxy
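
As a minimal sketch of the masquerading half (the address 157.240.0.1 and port 443 below are placeholders, not taken from the listing above), something like the following binds a socket to a foreign address, assuming the process has CAP_NET_ADMIN:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void) {
  int fd = socket(AF_INET, SOCK_STREAM, 0);
  int one = 1;

  // Without IP_TRANSPARENT, bind() to a non-local address
  // fails with EADDRNOTAVAIL.
  if (setsockopt(fd, SOL_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0) {
    perror("IP_TRANSPARENT");
    return 1;
  }

  struct sockaddr_in fake = {0};
  fake.sin_family = AF_INET;
  fake.sin_port = htons(443);
  inet_pton(AF_INET, "157.240.0.1", &fake.sin_addr); // placeholder remote

  if (bind(fd, (struct sockaddr *)&fake, sizeof(fake)) < 0) {
    perror("bind");
    return 1;
  }
  // The socket can now listen()/connect() as 157.240.0.1:443.
  return 0;
}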

Inbound redirection

Why replace the socket

When the kernel network stack receives a packet, it uses the packet's 5-tuple to look up the best-matching socket in the corresponding protocol's hash table, then queues the packet on that socket's receive queue. Taking UDP as an example:

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/udp.c#L2405
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	// ...
	sk = skb_steal_sock(skb, &refcounted);
	if (sk) {
		// ...
		ret = udp_unicast_rcv_skb(sk, skb, uh);

static inline struct sock *
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
	if (skb->sk) {
		struct sock *sk = skb->sk;
		// ...
		return sk;

static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
			       struct udphdr *uh)
{
	// ...
	ret = udp_queue_rcv_skb(sk, skb);

Netfilter hooks run before this protocol-stack lookup, so setting skb->sk from within netfilter decides which socket's receive queue the packet will ultimately land in.

Kernel implementation

The following is based on kernel v6.1.34 and uses the iptables TPROXY module as the example; the nftables implementation is essentially the same.

Core logic

The main processing happens in tproxy_tg4() in net/netfilter/xt_TPROXY.c.

First, extract the transport header from the skb:

static unsigned int
tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
	   u_int32_t mark_mask, u_int32_t mark_value)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct udphdr _hdr, *hp;
	struct sock *sk;

	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
	if (hp == NULL)
		return NF_DROP;

Then it looks for a socket (sk in the code) to replace the one originally held by the skb.

If a packet with the same 4-tuple has already been redirected, the proxy should have established a connection with the client, and the current packet should be redirected to that same connection:

	/* check if there's an ongoing connection on the packet
	 * addresses, this happens if the redirect already happened
	 * and the current packet belongs to an already established
	 * connection */
	sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
				   iph->saddr, iph->daddr,
				   hp->source, hp->dest,
				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);

Next, determine the default redirect destination, where any packet not yet handled will be sent. The address and port specified in the rule take precedence; otherwise the primary address of the receiving network device is used:

	laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
	if (!lport)
		lport = hp->dest;

__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
	const struct in_ifaddr *ifa;
	struct in_device *indev;
	__be32 laddr;

	if (user_laddr)
		return user_laddr;

	laddr = 0;
	indev = __in_dev_get_rcu(skb->dev);

	in_dev_for_each_ifa_rcu(ifa, indev) {
		if (ifa->ifa_flags & IFA_F_SECONDARY)
			continue;

		laddr = ifa->ifa_local;
		break;
	}

	return laddr ? laddr : daddr;
}

A SYN hitting a connection in TIME_WAIT is forwarded to the proxy's listener to establish a new connection, rather than reopening the TIME_WAIT connection. My guess is that this makes it simpler for the proxy to keep the state of the two legs (client <-> proxy <-> remote) in sync:

	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
	if (sk && sk->sk_state == TCP_TIME_WAIT)
		/* reopening a TIME_WAIT connection needs special handling */
		sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);

/**
 * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
 * @skb:	The skb being processed.
 * @laddr:	IPv4 address to redirect to or zero.
 * @lport:	TCP port to redirect to or zero.
 * @sk:		The TIME_WAIT TCP socket found by the lookup.
 *
 * We have to handle SYN packets arriving to TIME_WAIT sockets
 * differently: instead of reopening the connection we should rather
 * redirect the new connection to the proxy if there's a listener
 * socket present.
 *
 * nf_tproxy_handle_time_wait4() consumes the socket reference passed in.
 *
 * Returns the listener socket if there's one, the TIME_WAIT socket if
 * no such listener is found, or NULL if the TCP header is incomplete.
 */
struct sock *
nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
			 __be32 laddr, __be16 lport, struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr _hdr, *hp;

	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
	if (hp == NULL) {
		inet_twsk_put(inet_twsk(sk));
		return NULL;
	}

	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
		/* SYN to a TIME_WAIT socket, we'd rather redirect it
		 * to a listener socket if there's one */
		struct sock *sk2;

		sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
					    iph->saddr, laddr ? laddr : iph->daddr,
					    hp->source, lport ? lport : hp->dest,
					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
		if (sk2) {
			nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
			sk = sk2;
		}
	}

	return sk;
}

If no established connection matched, fall back to a listening socket at the redirect destination:

	else if (!sk)
		/* no, there's no established connection, check if
		 * there's a listener on the redirected addr/port */
		sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
					   iph->saddr, laddr,
					   hp->source, lport,
					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);

Finally, check that the new socket qualifies for transparent proxying, and use it to replace the skb's original socket:

	/* NOTE: assign_sock consumes our sk reference */
	if (sk && nf_tproxy_sk_is_transparent(sk)) {
		/* This should be in a separate target, but we don't do multiple
		   targets on the same rule yet */
		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
		nf_tproxy_assign_sock(skb, sk);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/* assign a socket to the skb -- consumes sk */
static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = sock_edemux;
}

Socket matching

nf_tproxy_get_sock_v4() is a thin wrapper around the generic TCP/UDP socket lookup functions.

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/netfilter/nf_tproxy_ipv4.c#L75
/*
 * This is used when the user wants to intercept a connection matching
 * an explicit iptables rule. In this case the sockets are assumed
 * matching in preference order:
 *
 *   - match: if there's a fully established connection matching the
 *     _packet_ tuple, it is returned, assuming the redirection
 *     already took place and we process a packet belonging to an
 *     established connection
 *
 *   - match: if there's a listening socket matching the redirection
 *     (e.g. on-port & on-ip of the connection), it is returned,
 *     regardless if it was bound to 0.0.0.0 or an explicit
 *     address. The reasoning is that if there's an explicit rule, it
 *     does not really matter if the listener is bound to an interface
 *     or to 0. The user already stated that he wants redirection
 *     (since he added the rule).
 *
 * Please note that there's an overlap between what a TPROXY target
 * and a socket match will match. Normally if you have both rules the
 * "socket" match will be the first one, effectively all packets
 * belonging to established connections going through that one.
 */
struct sock *
nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
		      const u8 protocol,
		      const __be32 saddr, const __be32 daddr,
		      const __be16 sport, const __be16 dport,
		      const struct net_device *in,
		      const enum nf_tproxy_lookup_t lookup_type)
{
	struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
	struct sock *sk;
	switch (protocol) {

For TCP, both states have dedicated lookup functions; the only extra work is taking a reference on a listening socket so that it is not freed while in use:

	case IPPROTO_TCP: {
		struct tcphdr _hdr, *hp;

		hp = skb_header_pointer(skb, ip_hdrlen(skb),
					sizeof(struct tcphdr), &_hdr);
		if (hp == NULL)
			return NULL;

		switch (lookup_type) {
		case NF_TPROXY_LOOKUP_LISTENER:
			sk = inet_lookup_listener(net, hinfo, skb,
						  ip_hdrlen(skb) + __tcp_hdrlen(hp),
						  saddr, sport, daddr, dport,
						  in->ifindex, 0);

			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
				sk = NULL;
			/* NOTE: we return listeners even if bound to
			 * 0.0.0.0, those are filtered out in
			 * xt_socket, since xt_TPROXY needs 0 bound
			 * listeners too
			 */
			break;
		case NF_TPROXY_LOOKUP_ESTABLISHED:
			sk = inet_lookup_established(net, hinfo, saddr, sport,
						     daddr, dport, in->ifindex);
			break;
		default:
			BUG();
		}
		break;
		}

For UDP, an extra check is needed to decide whether the lookup result is usable:

	case IPPROTO_UDP:
		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
				     in->ifindex);
		if (sk) {
			int connected = (sk->sk_state == TCP_ESTABLISHED);
			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);

			/* NOTE: we return listeners even if bound to
			 * 0.0.0.0, those are filtered out in
			 * xt_socket, since xt_TPROXY needs 0 bound
			 * listeners too
			 */
			if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
			      (!connected || wildcard)) ||
			    (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
				sock_put(sk);
				sk = NULL;
			}
		}
		break;

There are two qualifying conditions:

  • connected indicates whether the socket is "connected"
  • wildcard indicates whether the bound address is INADDR_ANY (0.0.0.0)

But I don't understand the !connected || wildcard condition: when connected is true, wildcard must be false, so || wildcard looks redundant.

A UDP socket becomes connected after connect()ing to a destination. If the socket was not already bound to a specific IP that can be written into the source address field of outgoing IP packets, connect() has the routing subsystem pick a local address, uses it as both the source address and the local binding, and stores it in the inet_rcv_saddr field. Only a disconnect resets inet_rcv_saddr back to INADDR_ANY:

// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/datagram.c#L64
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	//...

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;	/* Update source address */
	if (!inet->inet_rcv_saddr) {
		inet->inet_rcv_saddr = fl4->saddr;
		if (sk->sk_prot->rehash)
			sk->sk_prot->rehash(sk);
	}

	// ...

	sk->sk_state = TCP_ESTABLISHED;

	// ...
}

int __udp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	/*
	 *	1003.1g - break association.
	 */

	sk->sk_state = TCP_CLOSE;

	// ...

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
		inet_reset_saddr(sk);

	// ...
}

static __inline__ void inet_reset_saddr(struct sock *sk)
{
	inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
	// ...
}

So a connected UDP socket's inet_rcv_saddr is always a specific IP and can never be INADDR_ANY.
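
This is easy to observe from userspace. A small sketch (8.8.8.8:53 is just an arbitrary routable destination I picked) prints the local address that connect() fixes for an unbound UDP socket:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void) {
  int fd = socket(AF_INET, SOCK_DGRAM, 0);
  struct sockaddr_in dst = {0}, local = {0};
  socklen_t len = sizeof(local);

  dst.sin_family = AF_INET;
  dst.sin_port = htons(53);
  inet_pton(AF_INET, "8.8.8.8", &dst.sin_addr);

  // connect() routes the destination and fixes the local binding
  connect(fd, (struct sockaddr *)&dst, sizeof(dst));
  getsockname(fd, (struct sockaddr *)&local, &len);

  // Prints a specific local IP, never 0.0.0.0 -- matching inet_rcv_saddr
  printf("local: %s:%d\n", inet_ntoa(local.sin_addr), ntohs(local.sin_port));
  return 0;
}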

The commit that added these conditions mentions that nf_tproxy_get_sock_v4() is also used by the iptables socket extension. My guess is that this may be a historical leftover?

Usage

Using the iptables TPROXY extension as an example:

  1. --on-port/--on-ip specify the redirect destination
  2. Since the packet's destination address is left unmodified, the routing decision after PREROUTING would still send it down the FORWARD chain because the destination is not local. Policy routing is therefore needed to steer the packet into the INPUT chain:
ip rule add fwmark 0x233 table 100
ip route add local default dev lo table 100

iptables -t mangle -A PREROUTING -p udp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233

This replaces the packet's original socket with the one bound to :10000 and marks the packet with 0x233. The policy rule sends all packets marked 0x233 to routing table 100, whose default route is a local route via the lo loopback device. Packets taking a local route on the loopback device are treated as destined for the local host, which keeps them from being forwarded.

Optimizing performance with -m socket

The comment on nf_tproxy_get_sock_v4() mentions this:

/*
 * Please note that there's an overlap between what a TPROXY target
 * and a socket match will match. Normally if you have both rules the
 * "socket" match will be the first one, effectively all packets
 * belonging to established connections going through that one.
*/

Once a connection has been established for packets that TProxy redirected, the network stack holds a mapping from the packets' original 5-tuple to the socket. For subsequent packets with the same 5-tuple, the socket found by the stack's regular lookup is the same one TPROXY finds first with sk = nf_tproxy_get_sock_v4(...., NF_TPROXY_LOOKUP_ESTABLISHED): it is already the "right" (that is, already-redirected) one, and the socket replacement that follows is unnecessary. An iptables socket rule can divert this traffic early and improve performance.

It is uncommon for a UDP proxy program to connect() back to the client, so only TCP is shown here:

iptables -t mangle -N tproxy_divert
iptables -t mangle -A tproxy_divert -j MARK --set-mark 0x233
iptables -t mangle -A tproxy_divert -j ACCEPT

iptables -t mangle -A PREROUTING -p tcp -m socket -j tproxy_divert
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-port 10000 --on-ip 127.0.0.1 --tproxy-mark 0x233

Obtaining the original destination address

TCP

getsockname() returns the "local" address of the accepted client socket, which is the packet's original destination address:

client_fd = accept(server_fd, (struct sockaddr *)&client_addr, &addr_len);

getsockname(client_fd, (struct sockaddr *)&orig_dst, &addr_len);
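
Putting it together, here is a minimal sketch of the TCP side; I assume the listener is bound to port 10000 to match the TPROXY rules above:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void) {
  int server_fd = socket(AF_INET, SOCK_STREAM, 0);
  if (server_fd < 0) {
    perror("socket");
    exit(EXIT_FAILURE);
  }

  int opt = 1;
  setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
  if (setsockopt(server_fd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
    perror("IP_TRANSPARENT");
    exit(EXIT_FAILURE);
  }

  struct sockaddr_in addr = {0};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_ANY);
  addr.sin_port = htons(10000);
  if (bind(server_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
      listen(server_fd, 128) < 0) {
    perror("bind/listen");
    exit(EXIT_FAILURE);
  }

  for (;;) {
    struct sockaddr_in client_addr, orig_dst;
    socklen_t len = sizeof(client_addr);
    int client_fd = accept(server_fd, (struct sockaddr *)&client_addr, &len);
    if (client_fd < 0) {
      perror("accept");
      continue;
    }

    // The accepted socket's "local" address is the address the client
    // originally dialed, because TProxy never rewrote the packet.
    len = sizeof(orig_dst);
    getsockname(client_fd, (struct sockaddr *)&orig_dst, &len);
    printf("original destination: %s:%d\n", inet_ntoa(orig_dst.sin_addr),
           ntohs(orig_dst.sin_port));
    close(client_fd);
  }
}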

UDP

  1. Set the IP_RECVORIGDSTADDR socket option with setsockopt(..., SOL_IP, IP_RECVORIGDSTADDR, ...) so that recvmsg() provides IP_ORIGDSTADDR ancillary data carrying the packet's destination address. Since TProxy does not modify the original packet, this ancillary data is taken straight from the IP header:
// /net/ipv4/ip_sockglue.c
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
	struct sockaddr_in sin;
	const struct iphdr *iph = ip_hdr(skb);
	__be16 *ports = (__be16 *)skb_transport_header(skb);

	if (skb_transport_offset(skb) + 4 > (int)skb->len)
		return;

	/* All current transport protocols have the port numbers in the
	 * first four bytes of the transport header and this function is
	 * written with this assumption in mind.
	 */

	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = iph->daddr;
	sin.sin_port = ports[1];
	memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
  2. Use recvmsg() to read the packet together with the ancillary data
  3. The ancillary data with level SOL_IP and type IP_ORIGDSTADDR is the original destination address

Complete example:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_BUF_SIZE 1024
#define SRC_ADDR INADDR_ANY
#define SRC_PORT 9999

int main() {
  int sockfd;
  struct sockaddr_in bind_addr, client_addr;
  char buffer[MAX_BUF_SIZE];

  if ((sockfd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
    perror("socket");
    exit(EXIT_FAILURE);
  }

  int opt = 1;
  if (setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
    perror("IP_TRANSPARENT");
    exit(EXIT_FAILURE);
  }

  // bind
  memset(&bind_addr, 0, sizeof(bind_addr));
  bind_addr.sin_family = AF_INET;
  bind_addr.sin_addr.s_addr = htonl(SRC_ADDR);
  bind_addr.sin_port = htons(SRC_PORT);
  if (bind(sockfd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)) < 0) {
    perror("bind");
    exit(EXIT_FAILURE);
  }

  // recvmsg
  if (setsockopt(sockfd, SOL_IP, IP_RECVORIGDSTADDR, &opt, sizeof(opt)) < 0) {
    perror("IP_RECVORIGDSTADDR");
    exit(EXIT_FAILURE);
  }
  while (1) {
    memset(buffer, 0, sizeof(buffer));
    struct msghdr msgh = {0};
    struct iovec iov[1];
    iov[0].iov_base = buffer;
    iov[0].iov_len = sizeof(buffer) - 1; // keep room for a NUL terminator
    msgh.msg_iov = iov;
    msgh.msg_iovlen = 1;
    msgh.msg_name = &client_addr;
    msgh.msg_namelen = sizeof(client_addr);
    char cmsgbuf[CMSG_SPACE(sizeof(struct sockaddr_in))];
    msgh.msg_control = cmsgbuf;
    msgh.msg_controllen = sizeof(cmsgbuf);
    if (recvmsg(sockfd, &msgh, 0) < 0) {
      perror("recvmsg");
      continue;
    }

    struct cmsghdr *cmsg;
    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
         cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
      if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_ORIGDSTADDR) {
        struct sockaddr_in *addr = (struct sockaddr_in *)CMSG_DATA(cmsg);
        printf("Original DST ADDR: %s\n", inet_ntoa(addr->sin_addr));
        break;
      }
    }
    printf("Data: %s\n", buffer);
  }

  close(sockfd);

  return 0;
}
