/* NAT for netfilter; shared with compatibility layer. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
11 #include <linux/module.h>
12 #include <linux/types.h>
13 #include <linux/timer.h>
14 #include <linux/skbuff.h>
15 #include <linux/netfilter_ipv4.h>
16 #include <linux/vmalloc.h>
17 #include <net/checksum.h>
20 #include <net/tcp.h> /* For tcp_prot in getorigdst */
21 #include <linux/icmp.h>
22 #include <linux/udp.h>
23 #include <linux/jhash.h>
/* Lock-assertion hooks used by listhelp.h: compiled out (no-op) here. */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)
28 #include <linux/netfilter_ipv4/ip_conntrack.h>
29 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
30 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
31 #include <linux/netfilter_ipv4/ip_nat.h>
32 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
33 #include <linux/netfilter_ipv4/ip_nat_core.h>
34 #include <linux/netfilter_ipv4/ip_nat_helper.h>
35 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
36 #include <linux/netfilter_ipv4/listhelp.h>
/* Debug output: compiled out (expands to nothing) in this build. */
#define DEBUGP(format, args...)

/* Protects the bysource hash chains and the ip_nat_protos[] table. */
DEFINE_RWLOCK(ip_nat_lock);

/* Calculated at init based on memory size */
static unsigned int ip_nat_htable_size;

/* Hash of conntracks keyed by ORIGINAL-direction source; lets us reuse
 * an existing source mapping for new connections from the same host. */
static struct list_head *bysource;

/* One handler slot per possible IP protocol number (8-bit protocol field). */
#define MAX_IP_NAT_PROTO 256
struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
54 static inline struct ip_nat_protocol *
55 __ip_nat_proto_find(u_int8_t protonum)
57 return ip_nat_protos[protonum];
60 struct ip_nat_protocol *
61 ip_nat_proto_find_get(u_int8_t protonum)
63 struct ip_nat_protocol *p;
65 /* we need to disable preemption to make sure 'p' doesn't get
66 * removed until we've grabbed the reference */
68 p = __ip_nat_proto_find(protonum);
69 if (!try_module_get(p->me))
70 p = &ip_nat_unknown_protocol;
75 EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
78 ip_nat_proto_put(struct ip_nat_protocol *p)
82 EXPORT_SYMBOL_GPL(ip_nat_proto_put);
84 /* We keep an extra hash for each conntrack, for fast searching. */
85 static inline unsigned int
86 hash_by_src(const struct ip_conntrack_tuple *tuple)
88 /* Original src, to ensure we map it consistently if poss. */
89 return jhash_3words(tuple->src.ip, tuple->src.u.all,
90 tuple->dst.protonum, 0) % ip_nat_htable_size;
93 /* Noone using conntrack by the time this called. */
94 static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
96 if (!(conn->status & IPS_NAT_DONE_MASK))
99 write_lock_bh(&ip_nat_lock);
100 list_del(&conn->nat.info.bysource);
101 write_unlock_bh(&ip_nat_lock);
104 /* We do checksum mangling, so if they were wrong before they're still
105 * wrong. Also works for incomplete packets (eg. ICMP dest
108 ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
110 u_int32_t diffs[] = { oldvalinv, newval };
111 return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
114 EXPORT_SYMBOL(ip_nat_cheat_check);
116 /* Is this tuple already taken? (not by us) */
118 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
119 const struct ip_conntrack *ignored_conntrack)
121 /* Conntrack tracking doesn't keep track of outgoing tuples; only
122 incoming ones. NAT means they don't have a fixed mapping,
123 so we invert the tuple and look for the incoming reply.
125 We could keep a separate hash if this proves too slow. */
126 struct ip_conntrack_tuple reply;
128 invert_tuplepr(&reply, tuple);
129 return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
131 EXPORT_SYMBOL(ip_nat_used_tuple);
133 /* If we source map this tuple so reply looks like reply_tuple, will
134 * that meet the constraints of range. */
136 in_range(const struct ip_conntrack_tuple *tuple,
137 const struct ip_nat_range *range)
139 struct ip_nat_protocol *proto =
140 __ip_nat_proto_find(tuple->dst.protonum);
142 /* If we are supposed to map IPs, then we must be in the
143 range specified, otherwise let this drag us onto a new src IP. */
144 if (range->flags & IP_NAT_RANGE_MAP_IPS) {
145 if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
146 || ntohl(tuple->src.ip) > ntohl(range->max_ip))
150 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
151 || proto->in_range(tuple, IP_NAT_MANIP_SRC,
152 &range->min, &range->max))
159 same_src(const struct ip_conntrack *ct,
160 const struct ip_conntrack_tuple *tuple)
162 return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
163 == tuple->dst.protonum
164 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
166 && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
167 == tuple->src.u.all);
170 /* Only called for SRC manip */
172 find_appropriate_src(const struct ip_conntrack_tuple *tuple,
173 struct ip_conntrack_tuple *result,
174 const struct ip_nat_range *range)
176 unsigned int h = hash_by_src(tuple);
177 struct ip_conntrack *ct;
179 read_lock_bh(&ip_nat_lock);
180 list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
181 if (same_src(ct, tuple)) {
182 /* Copy source part from reply tuple. */
183 invert_tuplepr(result,
184 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
185 result->dst = tuple->dst;
187 if (in_range(result, range)) {
188 read_unlock_bh(&ip_nat_lock);
193 read_unlock_bh(&ip_nat_lock);
197 /* For [FUTURE] fragmentation handling, we want the least-used
198 src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
199 if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
200 1-65535, we don't do pro-rata allocation based on ports; we choose
201 the ip with the lowest src-ip/dst-ip/proto usage.
204 find_best_ips_proto(struct ip_conntrack_tuple *tuple,
205 const struct ip_nat_range *range,
206 const struct ip_conntrack *conntrack,
207 enum ip_nat_manip_type maniptype)
211 u_int32_t minip, maxip, j;
213 /* No IP mapping? Do nothing. */
214 if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
217 if (maniptype == IP_NAT_MANIP_SRC)
218 var_ipp = &tuple->src.ip;
220 var_ipp = &tuple->dst.ip;
222 /* Fast path: only one choice. */
223 if (range->min_ip == range->max_ip) {
224 *var_ipp = range->min_ip;
228 /* Hashing source and destination IPs gives a fairly even
229 * spread in practice (if there are a small number of IPs
230 * involved, there usually aren't that many connections
231 * anyway). The consistency means that servers see the same
232 * client coming from the same IP (some Internet Banking sites
233 * like this), even across reboots. */
234 minip = ntohl(range->min_ip);
235 maxip = ntohl(range->max_ip);
236 j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
237 *var_ipp = htonl(minip + j % (maxip - minip + 1));
240 /* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING,
241 * we change the source to map into the range. For NF_IP_PRE_ROUTING
242 * and NF_IP_LOCAL_OUT, we change the destination to map into the
243 * range. It might not be possible to get a unique tuple, but we try.
244 * At worst (or if we race), we will end up with a final duplicate in
245 * __ip_conntrack_confirm and drop the packet. */
247 get_unique_tuple(struct ip_conntrack_tuple *tuple,
248 const struct ip_conntrack_tuple *orig_tuple,
249 const struct ip_nat_range *range,
250 struct ip_conntrack *conntrack,
251 enum ip_nat_manip_type maniptype)
253 struct ip_nat_protocol *proto;
255 /* 1) If this srcip/proto/src-proto-part is currently mapped,
256 and that same mapping gives a unique tuple within the given
259 This is only required for source (ie. NAT/masq) mappings.
260 So far, we don't do local source mappings, so multiple
261 manips not an issue. */
262 if (maniptype == IP_NAT_MANIP_SRC) {
263 if (find_appropriate_src(orig_tuple, tuple, range)) {
264 DEBUGP("get_unique_tuple: Found current src map\n");
265 if (!ip_nat_used_tuple(tuple, conntrack))
270 /* 2) Select the least-used IP/proto combination in the given
272 *tuple = *orig_tuple;
273 find_best_ips_proto(tuple, range, conntrack, maniptype);
275 /* 3) The per-protocol part of the manip is made to map into
276 the range to make a unique tuple. */
278 proto = ip_nat_proto_find_get(orig_tuple->dst.protonum);
280 /* Only bother mapping if it's not already in range and unique */
281 if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
282 || proto->in_range(tuple, maniptype, &range->min, &range->max))
283 && !ip_nat_used_tuple(tuple, conntrack)) {
284 ip_nat_proto_put(proto);
288 /* Last change: get protocol to try to obtain unique tuple. */
289 proto->unique_tuple(tuple, range, maniptype, conntrack);
291 ip_nat_proto_put(proto);
295 ip_nat_setup_info(struct ip_conntrack *conntrack,
296 const struct ip_nat_range *range,
297 unsigned int hooknum)
299 struct ip_conntrack_tuple curr_tuple, new_tuple;
300 struct ip_nat_info *info = &conntrack->nat.info;
301 int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
302 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
304 IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
305 || hooknum == NF_IP_POST_ROUTING
306 || hooknum == NF_IP_LOCAL_IN
307 || hooknum == NF_IP_LOCAL_OUT);
308 BUG_ON(ip_nat_initialized(conntrack, maniptype));
310 /* What we've got will look like inverse of reply. Normally
311 this is what is in the conntrack, except for prior
312 manipulations (future optimization: if num_manips == 0,
314 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
315 invert_tuplepr(&curr_tuple,
316 &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
318 get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
320 if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
321 struct ip_conntrack_tuple reply;
323 /* Alter conntrack table so will recognize replies. */
324 invert_tuplepr(&reply, &new_tuple);
325 ip_conntrack_alter_reply(conntrack, &reply);
327 /* Non-atomic: we own this at the moment. */
328 if (maniptype == IP_NAT_MANIP_SRC)
329 conntrack->status |= IPS_SRC_NAT;
331 conntrack->status |= IPS_DST_NAT;
334 /* Place in source hash if this is the first time. */
337 = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
339 write_lock_bh(&ip_nat_lock);
340 list_add(&info->bysource, &bysource[srchash]);
341 write_unlock_bh(&ip_nat_lock);
345 if (maniptype == IP_NAT_MANIP_DST)
346 set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
348 set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
352 EXPORT_SYMBOL(ip_nat_setup_info);
354 /* Returns true if succeeded. */
356 manip_pkt(u_int16_t proto,
357 struct sk_buff **pskb,
358 unsigned int iphdroff,
359 const struct ip_conntrack_tuple *target,
360 enum ip_nat_manip_type maniptype)
363 struct ip_nat_protocol *p;
365 if (!skb_make_writable(pskb, iphdroff + sizeof(*iph)))
368 iph = (void *)(*pskb)->data + iphdroff;
370 /* Manipulate protcol part. */
371 p = ip_nat_proto_find_get(proto);
372 if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) {
378 iph = (void *)(*pskb)->data + iphdroff;
380 if (maniptype == IP_NAT_MANIP_SRC) {
381 iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
383 iph->saddr = target->src.ip;
385 iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
387 iph->daddr = target->dst.ip;
392 /* Do packet manipulations according to ip_nat_setup_info. */
393 unsigned int ip_nat_packet(struct ip_conntrack *ct,
394 enum ip_conntrack_info ctinfo,
395 unsigned int hooknum,
396 struct sk_buff **pskb)
398 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
399 unsigned long statusbit;
400 enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
402 if (mtype == IP_NAT_MANIP_SRC)
403 statusbit = IPS_SRC_NAT;
405 statusbit = IPS_DST_NAT;
407 /* Invert if this is reply dir. */
408 if (dir == IP_CT_DIR_REPLY)
409 statusbit ^= IPS_NAT_MASK;
411 /* Non-atomic: these bits don't change. */
412 if (ct->status & statusbit) {
413 struct ip_conntrack_tuple target;
415 /* We are aiming to look like inverse of other direction. */
416 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
418 if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
423 EXPORT_SYMBOL_GPL(ip_nat_packet);
425 /* Dir is direction ICMP is coming from (opposite to packet it contains) */
426 int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
427 struct ip_conntrack *ct,
428 enum ip_nat_manip_type manip,
429 enum ip_conntrack_dir dir)
435 struct ip_conntrack_tuple inner, target;
436 int hdrlen = (*pskb)->nh.iph->ihl * 4;
438 if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
441 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
443 /* We're actually going to mangle it beyond trivial checksum
444 adjustment, so make sure the current checksum is correct. */
445 if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
446 hdrlen = (*pskb)->nh.iph->ihl * 4;
447 if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
448 (*pskb)->len - hdrlen, 0)))
452 /* Must be RELATED */
453 IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
454 (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
456 /* Redirects on non-null nats must be dropped, else they'll
457 start talking to each other without our translation, and be
459 if (inside->icmp.type == ICMP_REDIRECT) {
460 /* If NAT isn't finished, assume it and drop. */
461 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
464 if (ct->status & IPS_NAT_MASK)
468 DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
469 *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
471 if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
472 sizeof(struct icmphdr) + inside->ip.ihl*4,
474 __ip_conntrack_proto_find(inside->ip.protocol)))
477 /* Change inner back to look like incoming packet. We do the
478 opposite manip on this hook to normal, because it might not
479 pass all hooks (locally-generated ICMP). Consider incoming
480 packet: PREROUTING (DST manip), routing produces ICMP, goes
481 through POSTROUTING (which must correct the DST manip). */
482 if (!manip_pkt(inside->ip.protocol, pskb,
483 (*pskb)->nh.iph->ihl*4
484 + sizeof(inside->icmp),
485 &ct->tuplehash[!dir].tuple,
489 /* Reloading "inside" here since manip_pkt inner. */
490 inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
491 inside->icmp.checksum = 0;
492 inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
493 (*pskb)->len - hdrlen,
496 /* Change outer to look the reply to an incoming packet
497 * (proto 0 means don't invert per-proto part). */
499 /* Obviously, we need to NAT destination IP, but source IP
500 should be NAT'ed only if it is from a NAT'd host.
502 Explanation: some people use NAT for anonymizing. Also,
503 CERT recommends dropping all packets from private IP
504 addresses (although ICMP errors from internal links with
505 such addresses are not too uncommon, as Alan Cox points
507 if (manip != IP_NAT_MANIP_SRC
508 || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
509 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
510 if (!manip_pkt(0, pskb, 0, &target, manip))
516 EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation);
518 /* Protocol registration. */
519 int ip_nat_protocol_register(struct ip_nat_protocol *proto)
523 write_lock_bh(&ip_nat_lock);
524 if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
528 ip_nat_protos[proto->protonum] = proto;
530 write_unlock_bh(&ip_nat_lock);
533 EXPORT_SYMBOL(ip_nat_protocol_register);
535 /* Noone stores the protocol anywhere; simply delete it. */
536 void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
538 write_lock_bh(&ip_nat_lock);
539 ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
540 write_unlock_bh(&ip_nat_lock);
542 /* Someone could be still looking at the proto in a bh. */
545 EXPORT_SYMBOL(ip_nat_protocol_unregister);
#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Dump @range's min/max ports as netlink attributes.  Returns 0 on
 * success, -1 if the skb ran out of room (NFA_PUT jumps to the
 * nfattr_failure label). */
int
ip_nat_port_range_to_nfattr(struct sk_buff *skb,
			    const struct ip_nat_range *range)
{
	NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t),
		&range->min.tcp.port);
	NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t),
		&range->max.tcp.port);

	return 0;

nfattr_failure:
	return -1;
}

/* Parse min/max port attributes from @tb into @range.  A lone MIN
 * implies MAX == MIN.  Returns 1 if anything was parsed, else 0. */
int
ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range)
{
	int ret = 0;

	/* we have to return whether we actually parsed something or not */

	if (tb[CTA_PROTONAT_PORT_MIN-1]) {
		ret = 1;
		range->min.tcp.port =
			*(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]);
	}

	if (!tb[CTA_PROTONAT_PORT_MAX-1]) {
		if (ret)
			range->max.tcp.port = range->min.tcp.port;
	} else {
		ret = 1;
		range->max.tcp.port =
			*(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range);
EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
#endif
592 static int __init ip_nat_init(void)
596 /* Leave them the same for the moment. */
597 ip_nat_htable_size = ip_conntrack_htable_size;
599 /* One vmalloc for both hash tables */
600 bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
604 /* Sew in builtin protocols. */
605 write_lock_bh(&ip_nat_lock);
606 for (i = 0; i < MAX_IP_NAT_PROTO; i++)
607 ip_nat_protos[i] = &ip_nat_unknown_protocol;
608 ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
609 ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
610 ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
611 write_unlock_bh(&ip_nat_lock);
613 for (i = 0; i < ip_nat_htable_size; i++) {
614 INIT_LIST_HEAD(&bysource[i]);
617 /* FIXME: Man, this is a hack. <SIGH> */
618 IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
619 ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
621 /* Initialize fake conntrack so that NAT will skip it */
622 ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
626 /* Clear NAT section of all conntracks, in case we're loaded again. */
627 static int clean_nat(struct ip_conntrack *i, void *data)
629 memset(&i->nat, 0, sizeof(i->nat));
630 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
634 static void __exit ip_nat_cleanup(void)
636 ip_ct_iterate_cleanup(&clean_nat, NULL);
637 ip_conntrack_destroyed = NULL;
641 MODULE_LICENSE("GPL");
643 module_init(ip_nat_init);
644 module_exit(ip_nat_cleanup);