123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169 |
- From: Pablo Neira Ayuso <pablo@netfilter.org>
- Date: Sun, 7 Jan 2018 01:03:56 +0100
- Subject: [PATCH] netfilter: nf_conntrack: add IPS_OFFLOAD status bit
- This new bit tells us that the conntrack entry is owned by the flow
- table offload infrastructure.
- # cat /proc/net/nf_conntrack
- ipv4 2 tcp 6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2
- Note the [OFFLOAD] tag in the listing.
- The timer of such conntrack entries look like stopped from userspace.
- In practise, to make sure the conntrack entry does not go away, the
- conntrack timer is periodically set to an arbitrary large value that
- gets refreshed on every iteration from the garbage collector, so it
- never expires- and they display no internal state in the case of TCP
- flows. This allows us to save a bitcheck from the packet path via
- nf_ct_is_expired().
- Conntrack entries that have been offloaded to the flow table
- infrastructure cannot be deleted/flushed via ctnetlink. The flow table
- infrastructure is also responsible for releasing this conntrack entry.
- Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
- ---
- --- a/include/uapi/linux/netfilter/nf_conntrack_common.h
- +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
- @@ -101,12 +101,16 @@ enum ip_conntrack_status {
- IPS_HELPER_BIT = 13,
- IPS_HELPER = (1 << IPS_HELPER_BIT),
-
- + /* Conntrack has been offloaded to flow table. */
- + IPS_OFFLOAD_BIT = 14,
- + IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
- +
- /* Be careful here, modifying these bits can make things messy,
- * so don't let users modify them directly.
- */
- IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
- IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
- - IPS_SEQ_ADJUST | IPS_TEMPLATE),
- + IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
-
- __IPS_MAX_BIT = 14,
- };
- --- a/net/netfilter/nf_conntrack_core.c
- +++ b/net/netfilter/nf_conntrack_core.c
- @@ -975,6 +975,9 @@ static unsigned int early_drop_list(stru
- hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
- tmp = nf_ct_tuplehash_to_ctrack(h);
-
- + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
- + continue;
- +
- if (nf_ct_is_expired(tmp)) {
- nf_ct_gc_expired(tmp);
- continue;
- @@ -1052,6 +1055,18 @@ static bool gc_worker_can_early_drop(con
- return false;
- }
-
- +#define DAY (86400 * HZ)
- +
- +/* Set an arbitrary timeout large enough not to ever expire, this save
- + * us a check for the IPS_OFFLOAD_BIT from the packet path via
- + * nf_ct_is_expired().
- + */
- +static void nf_ct_offload_timeout(struct nf_conn *ct)
- +{
- + if (nf_ct_expires(ct) < DAY / 2)
- + ct->timeout = nfct_time_stamp + DAY;
- +}
- +
- static void gc_worker(struct work_struct *work)
- {
- unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
- @@ -1088,6 +1103,11 @@ static void gc_worker(struct work_struct
- tmp = nf_ct_tuplehash_to_ctrack(h);
-
- scanned++;
- + if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
- + nf_ct_offload_timeout(tmp);
- + continue;
- + }
- +
- if (nf_ct_is_expired(tmp)) {
- nf_ct_gc_expired(tmp);
- expired_count++;
- --- a/net/netfilter/nf_conntrack_netlink.c
- +++ b/net/netfilter/nf_conntrack_netlink.c
- @@ -1125,6 +1125,14 @@ static const struct nla_policy ct_nla_po
- .len = NF_CT_LABELS_MAX_SIZE },
- };
-
- +static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
- +{
- + if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
- + return 0;
- +
- + return ctnetlink_filter_match(ct, data);
- +}
- +
- static int ctnetlink_flush_conntrack(struct net *net,
- const struct nlattr * const cda[],
- u32 portid, int report)
- @@ -1137,7 +1145,7 @@ static int ctnetlink_flush_conntrack(str
- return PTR_ERR(filter);
- }
-
- - nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
- + nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
- portid, report);
- kfree(filter);
-
- @@ -1183,6 +1191,11 @@ static int ctnetlink_del_conntrack(struc
-
- ct = nf_ct_tuplehash_to_ctrack(h);
-
- + if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
- + nf_ct_put(ct);
- + return -EBUSY;
- + }
- +
- if (cda[CTA_ID]) {
- __be32 id = nla_get_be32(cda[CTA_ID]);
-
- --- a/net/netfilter/nf_conntrack_proto_tcp.c
- +++ b/net/netfilter/nf_conntrack_proto_tcp.c
- @@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_c
- /* Print out the private part of the conntrack. */
- static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
- {
- + if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
- + return;
- +
- seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
- }
- #endif
- --- a/net/netfilter/nf_conntrack_standalone.c
- +++ b/net/netfilter/nf_conntrack_standalone.c
- @@ -310,10 +310,12 @@ static int ct_seq_show(struct seq_file *
- WARN_ON(!l4proto);
-
- ret = -ENOSPC;
- - seq_printf(s, "%-8s %u %-8s %u %ld ",
- + seq_printf(s, "%-8s %u %-8s %u ",
- l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
- - l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
- - nf_ct_expires(ct) / HZ);
- + l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
- +
- + if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
- + seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ);
-
- if (l4proto->print_conntrack)
- l4proto->print_conntrack(s, ct);
- @@ -340,7 +342,9 @@ static int ct_seq_show(struct seq_file *
- if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
- goto release;
-
- - if (test_bit(IPS_ASSURED_BIT, &ct->status))
- + if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
- + seq_puts(s, "[OFFLOAD] ");
- + else if (test_bit(IPS_ASSURED_BIT, &ct->status))
- seq_puts(s, "[ASSURED] ");
-
- if (seq_has_overflowed(s))
|