A Deep Dive into Linux TProxy
This is my first foray into the kernel network stack; if you spot mistakes, corrections are welcome (email), and I will mark them in the article.
TProxy (Transparent Proxy) is a transparent proxying mechanism supported by the kernel, introduced in Linux 2.6.28. Unlike NAT, which achieves redirection by rewriting a packet's destination address, TProxy only swaps out the socket originally held by the packet's skb; it never modifies the packet headers.
A note on terminology: TProxy is the general name of the feature, while TPROXY is the name of an iptables extension.
IP_TRANSPARENT
This option allows a socket to treat any non-local address as local: the socket can bind to a non-local address and then send and receive data while masquerading as that address. Setting it requires CAP_NET_ADMIN.
int opt = 1;
setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt));
For example, a gateway (192.168.0.1 / 123.x.x.94) acting as a transparent proxy hijacks the connection between a client (192.168.0.200) and a remote server (157.x.x.149): it connects to the remote on the client's behalf, while masquerading as the remote in its connection with the client:
$ netstat -atunp
Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
tcp 0 0 123.x.x.94:37338 157.x.x.149:443 ESTABLISHED 2904/proxy
tcp 0 0 ::ffff:157.x.x.149:443 ::ffff:192.168.0.200:56418 ESTABLISHED 2904/proxy
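A minimal sketch of how the proxy side of such a spoofed connection can be created (assuming CAP_NET_ADMIN and the routing setup described later; 203.0.113.7:443 is a placeholder, not the real remote address):
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

// Sketch: masquerade as the remote server when answering the client.
int spoofed_fd(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int one = 1;
    struct sockaddr_in remote = {0};

    if (fd < 0)
        return -1;
    // IP_TRANSPARENT lets the following bind() to a non-local address succeed.
    if (setsockopt(fd, SOL_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0)
        return -1;
    remote.sin_family = AF_INET;
    remote.sin_port = htons(443);
    inet_pton(AF_INET, "203.0.113.7", &remote.sin_addr); // placeholder
    // Bind to the remote server's address even though it is not local.
    if (bind(fd, (struct sockaddr *)&remote, sizeof(remote)) < 0)
        return -1;
    return fd;
}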
Inbound redirection
Why replace the socket
When the kernel network stack receives a packet, it uses the packet's 5-tuple to look up the best-matching socket in the corresponding protocol's hash table, then places the packet into that socket's receive queue. Taking UDP as an example:
// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/udp.c#L2405
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
                   int proto)
{
    // ...
    sk = skb_steal_sock(skb, &refcounted);
    if (sk) {
        // ...
        ret = udp_unicast_rcv_skb(sk, skb, uh);
        // ...
    }
    // ...
}

static inline struct sock *
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
    if (skb->sk) {
        struct sock *sk = skb->sk;
        // ...
        return sk;
    }
    // ...
}

static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
                               struct udphdr *uh)
{
    // ...
    ret = udp_queue_rcv_skb(sk, skb);
    // ...
}
Netfilter hooks run before the protocol stack's processing, so modifying skb->sk in netfilter decides which socket's receive queue the packet will eventually be placed in.
Kernel implementation
Based on kernel v6.1.34, using the implementation of the iptables TPROXY module as the example. The nftables implementation is essentially the same.
Core logic
The main processing flow is tproxy_tg4() in net/netfilter/xt_TPROXY.c.
It first extracts the transport header from the skb:
static unsigned int
tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
u_int32_t mark_mask, u_int32_t mark_value)
{
const struct iphdr *iph = ip_hdr(skb);
struct udphdr _hdr, *hp;
struct sock *sk;
hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
if (hp == NULL)
return NF_DROP;
It then starts looking for a socket (sk in the code) to replace the skb's original socket.
If packets of the same 4-tuple were redirected before, the proxy should already have an established connection with the client, and the current packet should be redirected to that connection too:
/* check if there's an ongoing connection on the packet
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
Next it computes the default redirect destination, which packets not yet handled should be redirected to. The value specified in the rule takes precedence; otherwise the primary address of the network device that received the packet is used:
laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
if (!lport)
lport = hp->dest;
__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
const struct in_ifaddr *ifa;
struct in_device *indev;
__be32 laddr;
if (user_laddr)
return user_laddr;
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
in_dev_for_each_ifa_rcu(ifa, indev) {
if (ifa->ifa_flags & IFA_F_SECONDARY)
continue;
laddr = ifa->ifa_local;
break;
}
return laddr ? laddr : daddr;
}
A SYN is forwarded to the proxy to establish a fresh connection instead of reusing one already in TIME_WAIT. My guess is that this makes it easier for the proxy to keep the state of both legs (client <-> proxy <-> remote) in sync:
/* UDP has no TCP_TIME_WAIT state, so we never enter here */
if (sk && sk->sk_state == TCP_TIME_WAIT)
/* reopening a TIME_WAIT connection needs special handling */
sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
/**
* nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
* @skb: The skb being processed.
* @laddr: IPv4 address to redirect to or zero.
* @lport: TCP port to redirect to or zero.
* @sk: The TIME_WAIT TCP socket found by the lookup.
*
* We have to handle SYN packets arriving to TIME_WAIT sockets
* differently: instead of reopening the connection we should rather
* redirect the new connection to the proxy if there's a listener
* socket present.
*
* nf_tproxy_handle_time_wait4() consumes the socket reference passed in.
*
* Returns the listener socket if there's one, the TIME_WAIT socket if
* no such listener is found, or NULL if the TCP header is incomplete.
*/
struct sock *
nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
__be32 laddr, __be16 lport, struct sock *sk)
{
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr _hdr, *hp;
hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
if (hp == NULL) {
inet_twsk_put(inet_twsk(sk));
return NULL;
}
if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
/* SYN to a TIME_WAIT socket, we'd rather redirect it
* to a listener socket if there's one */
struct sock *sk2;
sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
iph->saddr, laddr ? laddr : iph->daddr,
hp->source, lport ? lport : hp->dest,
skb->dev, NF_TPROXY_LOOKUP_LISTENER);
if (sk2) {
nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
return sk;
}
If no established connection matched, use the listening socket at the redirect destination:
else if (!sk)
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
iph->saddr, laddr,
hp->source, lport,
skb->dev, NF_TPROXY_LOOKUP_LISTENER);
Finally, it verifies that the new socket qualifies for transparent proxying and uses it to replace the skb's original socket:
/* NOTE: assign_sock consumes our sk reference */
if (sk && nf_tproxy_sk_is_transparent(sk)) {
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
return NF_DROP;
}
/* assign a socket to the skb -- consumes sk */
static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
{
skb_orphan(skb);
skb->sk = sk;
skb->destructor = sock_edemux;
}
Socket matching
nf_tproxy_get_sock_v4() is a thin wrapper around the generic TCP/UDP socket lookup functions.
// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/netfilter/nf_tproxy_ipv4.c#L75
/*
* This is used when the user wants to intercept a connection matching
* an explicit iptables rule. In this case the sockets are assumed
* matching in preference order:
*
* - match: if there's a fully established connection matching the
* _packet_ tuple, it is returned, assuming the redirection
* already took place and we process a packet belonging to an
* established connection
*
* - match: if there's a listening socket matching the redirection
* (e.g. on-port & on-ip of the connection), it is returned,
* regardless if it was bound to 0.0.0.0 or an explicit
* address. The reasoning is that if there's an explicit rule, it
* does not really matter if the listener is bound to an interface
* or to 0. The user already stated that he wants redirection
* (since he added the rule).
*
* Please note that there's an overlap between what a TPROXY target
* and a socket match will match. Normally if you have both rules the
* "socket" match will be the first one, effectively all packets
* belonging to established connections going through that one.
*/
struct sock *
nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
const u8 protocol,
const __be32 saddr, const __be32 daddr,
const __be16 sport, const __be16 dport,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
TCP has a dedicated lookup function for each of the two states; the only extra work is bumping the reference count of a listening socket so it cannot be reclaimed:
case IPPROTO_TCP: {
struct tcphdr _hdr, *hp;
hp = skb_header_pointer(skb, ip_hdrlen(skb),
sizeof(struct tcphdr), &_hdr);
if (hp == NULL)
return NULL;
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
sk = inet_lookup_listener(net, hinfo, skb,
ip_hdrlen(skb) + __tcp_hdrlen(hp),
saddr, sport, daddr, dport,
in->ifindex, 0);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
/* NOTE: we return listeners even if bound to
* 0.0.0.0, those are filtered out in
* xt_socket, since xt_TPROXY needs 0 bound
* listeners too
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
sk = inet_lookup_established(net, hinfo, saddr, sport,
daddr, dport, in->ifindex);
break;
default:
BUG();
}
break;
}
UDP needs an additional check on whether the lookup result is usable:
case IPPROTO_UDP:
sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
in->ifindex);
if (sk) {
int connected = (sk->sk_state == TCP_ESTABLISHED);
int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
/* NOTE: we return listeners even if bound to
* 0.0.0.0, those are filtered out in
* xt_socket, since xt_TPROXY needs 0 bound
* listeners too
*/
if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
(!connected || wildcard)) ||
(lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
sock_put(sk);
sk = NULL;
}
}
break;
There are two qualifying conditions:
- connected: whether the socket is "connected"
- wildcard: whether the bound address is INADDR_ANY (0.0.0.0)
I do not understand the condition !connected || wildcard, though: when connected is true, wildcard must be false, so || wildcard is redundant.
A UDP socket becomes connected after connect()ing to a destination. If it was not already bound to an exact IP (one that incoming packets would carry in their destination address field), connect() picks a local address via a route lookup to serve as both the source address and the local binding address, and stores it in the inet_rcv_saddr field. Only a disconnect resets inet_rcv_saddr to INADDR_ANY:
// https://elixir.bootlin.com/linux/v6.1.34/source/net/ipv4/datagram.c#L64
int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
//...
if (!inet->inet_saddr)
inet->inet_saddr = fl4->saddr; /* Update source address */
if (!inet->inet_rcv_saddr) {
inet->inet_rcv_saddr = fl4->saddr;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
// ...
sk->sk_state = TCP_ESTABLISHED;
// ...
}
int __udp_disconnect(struct sock *sk, int flags)
{
struct inet_sock *inet = inet_sk(sk);
/*
* 1003.1g - break association.
*/
sk->sk_state = TCP_CLOSE;
// ...
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
inet_reset_saddr(sk);
// ...
    }
    // ...
}

static __inline__ void inet_reset_saddr(struct sock *sk)
{
    inet_sk(sk)->inet_rcv_saddr = inet_sk(sk)->inet_saddr = 0;
}
So the inet_rcv_saddr of a connected UDP socket is always an exact IP and can never be INADDR_ANY.
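This is easy to confirm from user space: after connect() on a socket bound to INADDR_ANY, getsockname() already reports a concrete local address, mirroring what __ip4_datagram_connect() writes into inet_rcv_saddr. A small demo (the destination 192.0.2.1 is only illustrative):
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in dst = {0}, local = {0};
    socklen_t len = sizeof(local);

    dst.sin_family = AF_INET;
    dst.sin_port = htons(53);
    inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

    // connect() triggers a route lookup that fixes the local address.
    connect(fd, (struct sockaddr *)&dst, sizeof(dst));

    // Prints a concrete interface address, not 0.0.0.0.
    getsockname(fd, (struct sockaddr *)&local, &len);
    printf("local addr after connect(): %s\n", inet_ntoa(local.sin_addr));
    return 0;
}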
The commit that added these conditions mentions that nf_tproxy_get_sock_v4() is also used by the iptables socket extension. My guess is that this may be a historical leftover?
Usage
Taking the iptables TPROXY extension as an example:
- The redirect destination is specified with --on-port/--on-ip.
- Since the packet's destination address is not modified, the routing decision after PREROUTING would still send it to the FORWARD chain because the destination is not local. Policy routing is therefore needed to steer the packet into the INPUT chain.
ip rule add fwmark 0x233 table 100
ip route add local default dev lo table 100
iptables -t mangle -A PREROUTING -p udp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-ip 127.0.0.1 --on-port 10000 --tproxy-mark 0x233
This replaces the packet's original socket with the one bound to :10000 and marks the packet with 0x233. The policy rule sends all packets marked 0x233 to routing table 100, whose default route goes out the loopback device lo. Packets emitted from the loopback device are treated as destined for the local host, which prevents them from being forwarded.
Splitting traffic with -m socket for better performance
There is no clear explanation of this anywhere; everything below is my own understanding and speculation.
The comment on nf_tproxy_get_sock_v4() already hints at this:
/*
* Please note that there's an overlap between what a TPROXY target
* and a socket match will match. Normally if you have both rules the
* "socket" match will be the first one, effectively all packets
* belonging to established connections going through that one.
*/
After a connection redirected by TProxy is established, the network stack holds a mapping from the packet's original 5-tuple to the socket. For later packets of that connection, the socket matched during the stack's regular processing (the same one that sk = nf_tproxy_get_sock_v4(..., NF_TPROXY_LOOKUP_ESTABLISHED) finds in TPROXY) is already the redirected one, so the subsequent replacement is unnecessary.
Update 2024/06/17: an analysis of the performance difference.
TPROXY runs nf_tproxy_assign_sock() to replace sk. There, skb_orphan() invokes the skb's destructor sock_edemux, which calls sock_gen_put() to drop the old sk's reference count. For an already-redirected connection this is pure waste, because the new and old sk are one and the same. The socket match, by contrast, only needs sock_gen_put() when the sk it finds is not the one already attached to the skb. So the redundant, frequent sock_gen_put() calls in TPROXY hurt performance to some extent.
Also, TPROXY and socket were committed together, so I suspect the developers assumed transparent proxying would be divided between the two modules: socket handles established connections while TPROXY handles new ones. That would also explain why TPROXY does not check sk != skb->sk when replacing the socket: presumably the developers assumed TPROXY would mostly see new, never-redirected connections, and its established-connection lookup is only a fallback.
With UDP it is uncommon for a proxy to connect() back to the client, so only TCP is used as the example:
iptables -t mangle -N tproxy_divert
iptables -t mangle -A tproxy_divert -j MARK --set-mark 0x233
iptables -t mangle -A tproxy_divert -j ACCEPT
iptables -t mangle -A PREROUTING -p tcp -m socket -j tproxy_divert
iptables -t mangle -A PREROUTING -p tcp -j TPROXY --on-port 10000 --on-ip 127.0.0.1 --tproxy-mark 0x233
Obtaining the original destination address
TCP
Call getsockname() on the accepted client socket; its "local" address is the packets' original destination address:
struct sockaddr_in orig_dst;
socklen_t orig_dst_len = sizeof(orig_dst);
client_fd = accept(server_fd, (struct sockaddr *)&client_addr, &addr_len);
getsockname(client_fd, (struct sockaddr *)&orig_dst, &orig_dst_len);
UDP
- Set the IP_RECVORIGDSTADDR socket option with setsockopt(..., SOL_IP, IP_RECVORIGDSTADDR, ...) so that recvmsg() supplies IP_ORIGDSTADDR ancillary data carrying the packet's destination address. Because TProxy leaves the original packet untouched, this ancillary data is taken straight from the IP header:
// /net/ipv4/ip_sockglue.c
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
struct sockaddr_in sin;
const struct iphdr *iph = ip_hdr(skb);
__be16 *ports = (__be16 *)skb_transport_header(skb);
if (skb_transport_offset(skb) + 4 > (int)skb->len)
return;
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = iph->daddr;
sin.sin_port = ports[1];
memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
}
- Read the packet and its ancillary data with recvmsg().
- The ancillary data at level SOL_IP with type IP_ORIGDSTADDR is the original destination address.
A complete example:
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#define MAX_BUF_SIZE 1024
#define SRC_ADDR INADDR_ANY
#define SRC_PORT 9999
int main() {
int sockfd;
struct sockaddr_in bind_addr, client_addr;
char buffer[MAX_BUF_SIZE];
if ((sockfd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
perror("socket");
exit(EXIT_FAILURE);
}
int opt = 1;
if (setsockopt(sockfd, SOL_IP, IP_TRANSPARENT, &opt, sizeof(opt)) < 0) {
perror("IP_TRANSPARENT");
exit(EXIT_FAILURE);
}
// bind
memset(&bind_addr, 0, sizeof(bind_addr));
bind_addr.sin_family = AF_INET;
bind_addr.sin_addr.s_addr = htonl(SRC_ADDR);
bind_addr.sin_port = htons(SRC_PORT);
if (bind(sockfd, (struct sockaddr *)&bind_addr, sizeof(bind_addr)) < 0) {
perror("bind");
exit(EXIT_FAILURE);
}
// recvmsg
if (setsockopt(sockfd, SOL_IP, IP_RECVORIGDSTADDR, &opt, sizeof(opt)) < 0) {
perror("IP_RECVORIGDSTADDR");
exit(EXIT_FAILURE);
}
while (1) {
memset(buffer, 0, sizeof(buffer));
struct msghdr msgh = {0};
struct iovec iov[1];
iov[0].iov_base = buffer;
iov[0].iov_len = sizeof(buffer);
msgh.msg_iov = iov;
msgh.msg_iovlen = 1;
msgh.msg_name = &client_addr;
msgh.msg_namelen = sizeof(client_addr);
char cmsgbuf[CMSG_SPACE(sizeof(struct sockaddr_in))]; // room for IP_ORIGDSTADDR
msgh.msg_control = cmsgbuf;
msgh.msg_controllen = sizeof(cmsgbuf);
if (recvmsg(sockfd, &msgh, 0) < 0) {
perror("recvmsg");
continue;
}
struct cmsghdr *cmsg;
for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_ORIGDSTADDR) {
struct sockaddr_in *addr = (struct sockaddr_in *)CMSG_DATA(cmsg);
printf("Original DST ADDR: %s\n", inet_ntoa(addr->sin_addr));
break;
}
}
printf("Data: %s\n", buffer);
}
close(sockfd);
return 0;
}
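To answer the client, a UDP transparent proxy typically opens another IP_TRANSPARENT socket, binds it to the original destination address recovered above, and sends from there, so the reply appears to come from the address the client originally contacted. A sketch under those assumptions (reply_to_client() is a hypothetical helper, not part of any API):
#include <netinet/in.h>
#include <stddef.h>
#include <sys/socket.h>
#include <unistd.h>

// Hypothetical helper: reply to the client from the spoofed original
// destination. orig_dst comes from the IP_ORIGDSTADDR cmsg, client_addr
// from msg_name of recvmsg().
static int reply_to_client(const struct sockaddr_in *orig_dst,
                           const struct sockaddr_in *client_addr,
                           const char *data, size_t len)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    int one = 1;
    ssize_t n;

    if (fd < 0)
        return -1;
    // Transparent (+ reusable) so binding to the non-local address works.
    setsockopt(fd, SOL_IP, IP_TRANSPARENT, &one, sizeof(one));
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
    if (bind(fd, (const struct sockaddr *)orig_dst, sizeof(*orig_dst)) < 0) {
        close(fd);
        return -1;
    }
    // The client sees the datagram as coming from its original destination.
    n = sendto(fd, data, len, 0,
               (const struct sockaddr *)client_addr, sizeof(*client_addr));
    close(fd);
    return n < 0 ? -1 : 0;
}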
References
- Official documentation: Linux transparent proxy support
- TProxy 探秘
- Abusing Linux’s firewall: the hack that allowed us to build Spectrum
- iptables-extensions 中 socket 模块是个啥?
- 从 ss-redir 的实现到 Linux NAT
- Linux 网络栈接收数据(RX):原理及内核实现(2022)