Thread (6 messages) 6 messages, 3 authors, 2018-05-24

Re: [PATCH net-next v4 2/2] openvswitch: Support conntrack zone limit

From: Pravin Shelar <hidden>
Date: 2018-05-24 06:49:05

On Mon, May 21, 2018 at 5:16 PM, Yi-Hung Wei [off-list ref] wrote:
Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max.  In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table.  Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.

To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone.  For example, we can designate different zone to different VM,
and set conntrack limit to each zone.  By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs.  Moreover, the users can
set various conntrack limit to different zone based on their preference.

The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone.  If the number of
connection is above a configured limitation, ovs will return ENOMEM to the
userspace.  If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.

The following high leve APIs are provided to the userspace:
  - OVS_CT_LIMIT_CMD_SET:
    * set default connection limit for all zones
    * set the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_DEL:
    * remove the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_GET:
    * get the default connection limit for all zones
    * get the connection limit for a particular zone

Signed-off-by: Yi-Hung Wei <redacted>
I have few comments, but otherwise patch looks good.
quoted hunk ↗ jump to hunk
---
 net/openvswitch/Kconfig     |   3 +-
 net/openvswitch/conntrack.c | 541 +++++++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/conntrack.h |   9 +-
 net/openvswitch/datapath.c  |   7 +-
 net/openvswitch/datapath.h  |   3 +
 5 files changed, 557 insertions(+), 6 deletions(-)
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
                   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
                                     (!NF_NAT || NF_NAT) && \
                                     (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
-                                    (!NF_NAT_IPV6 || NF_NAT_IPV6)))
+                                    (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
+                                    (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
        select LIBCRC32C
        select MPLS
        select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 02fc343feb66..e8bb91420ca9 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/sctp.h>
+#include <linux/static_key.h>
 #include <net/ip.h>
+#include <net/genetlink.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_count.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
@@ -76,6 +79,31 @@ struct ovs_conntrack_info {
 #endif
 };

+#if    IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+#define OVS_CT_LIMIT_UNLIMITED 0
+#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
+#define CT_LIMIT_HASH_BUCKETS 512
+static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
+
+struct ovs_ct_limit {
+       /* Elements in ovs_ct_limit_info->limits hash table */
+       struct hlist_node hlist_node;
+       struct rcu_head rcu;
+       u16 zone;
+       u32 limit;
+};
+
+struct ovs_ct_limit_info {
+       u32 default_limit;
+       struct hlist_head *limits;
+       struct nf_conncount_data *data __aligned(8);
Why does it need explicit alignment attribute?
+};
+
+static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
+       [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
+};
+#endif
+
 static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
...
+static int ovs_ct_check_limit(struct net *net,
+                             const struct ovs_conntrack_info *info,
+                             const struct nf_conntrack_tuple *tuple)
+{
+       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+       const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+       u32 per_zone_limit, connections;
+       u32 conncount_key[5];
If the key size of single u32, why the array of 5 is defined for the key?
+
+       conncount_key[0] = info->zone.id;
+
+       per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
+       if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
+               return 0;
+
+       connections = nf_conncount_count(net, ct_limit_info->data,
+                                        conncount_key, tuple, &info->zone);
+       if (connections > per_zone_limit)
+               return -ENOMEM;
+
+       return 0;
...
-void ovs_ct_init(struct net *net)
+#if    IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
+{
+       int i, err;
+
+       ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info),
+                                        GFP_KERNEL);
+       if (!ovs_net->ct_limit_info)
+               return -ENOMEM;
+
+       ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
+       ovs_net->ct_limit_info->limits =
+               kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
+                             GFP_KERNEL);
+       if (!ovs_net->ct_limit_info->limits) {
+               kfree(ovs_net->ct_limit_info);
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
+               INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
+
+       ovs_net->ct_limit_info->data =
+               nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+
+       if (IS_ERR(ovs_net->ct_limit_info->data)) {
Can you print error msg, other wise it would be really hard to debug a
namespace launch failure due to this issue.
+               err = PTR_ERR(ovs_net->ct_limit_info->data);
+               kfree(ovs_net->ct_limit_info->limits);
+               kfree(ovs_net->ct_limit_info);
+               return err;
+       }
+       return 0;
+}
+
....
+static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
+                                      struct ovs_ct_limit_info *info)
+{
+       struct ovs_zone_limit *zone_limit;
+       int rem;
+       u16 zone;
+
+       rem = NLA_ALIGN(nla_len(nla_zone_limit));
+       zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+       while (rem >= sizeof(*zone_limit)) {
+               if (unlikely(!check_zone_id(zone_limit->zone_id, &zone))) {
+                       OVS_NLERR(true, "zone id is out of range");
There is no need to check if the port is out of range when we are
deleting it. since hash table lookup would fail anyways.
+               } else {
+                       ovs_lock();
+                       ct_limit_del(info, zone);
+                       ovs_unlock();
+               }
+               rem -= NLA_ALIGN(sizeof(*zone_limit));
+               zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+                               NLA_ALIGN(sizeof(*zone_limit)));
+       }
+
This API does not handle delete of default limit.
+       if (rem)
+               OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);
+
+       return 0;
+}
+
+static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
+                                         struct sk_buff *reply)
+{
+       struct ovs_zone_limit zone_limit;
+       int err;
+
+       zone_limit.zone_id = -1;
This is part of UAPI, Can you define constant in openvswitch.h for
default zone id to be -1.
+       zone_limit.limit = info->default_limit;
+       err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+static int ovs_ct_limit_get_zone_limit(struct net *net,
+                                      struct nlattr *nla_zone_limit,
+                                      struct ovs_ct_limit_info *info,
+                                      struct sk_buff *reply)
+{
+       struct nf_conntrack_zone ct_zone;
+       struct ovs_zone_limit *zone_limit;
+       int rem, err;
+       u32 conncount_key[5];
+       u16 zone;
+
+       rem = NLA_ALIGN(nla_len(nla_zone_limit));
+       zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+       while (rem >= sizeof(*zone_limit)) {
+               if (unlikely(zone_limit->zone_id == -1)) {
+                       err = ovs_ct_limit_get_default_limit(info, reply);
+                       if (err)
+                               return err;
+               } else if (unlikely(!check_zone_id(zone_limit->zone_id,
+                                                       &zone))) {
+                       OVS_NLERR(true, "zone id is out of range");
+               } else {
+                       rcu_read_lock();
+                       zone_limit->limit = ct_limit_get(info, zone);
+                       rcu_read_unlock();
+
+                       nf_ct_zone_init(&ct_zone, zone, NF_CT_DEFAULT_ZONE_DIR,
+                                       0);
+                       conncount_key[0] = zone;
+                       zone_limit->count = nf_conncount_count(
+                               net, info->data, conncount_key, NULL, &ct_zone);
+                       err = nla_put_nohdr(reply, sizeof(*zone_limit),
+                                           zone_limit);
+                       if (err)
+                               return err;
+               }
+               rem -= NLA_ALIGN(sizeof(*zone_limit));
+               zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+                               NLA_ALIGN(sizeof(*zone_limit)));
+       }
+
+       if (rem)
+               OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);
+
+       return 0;
+}
+
+static int ovs_ct_limit_get_all_zone_limit(struct net *net,
+                                          struct ovs_ct_limit_info *info,
+                                          struct sk_buff *reply)
+{
+       struct nf_conntrack_zone ct_zone;
+       struct ovs_zone_limit zone_limit;
+       struct ovs_ct_limit *ct_limit;
+       struct hlist_head *head;
+       u32 conncount_key[5];
+       int i, err = 0;
+
+       err = ovs_ct_limit_get_default_limit(info, reply);
+       if (err)
+               return err;
+
+       rcu_read_lock();
+       for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+               head = &info->limits[i];
+               hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+                       zone_limit.zone_id = ct_limit->zone;
+                       zone_limit.limit = ct_limit->limit;
+                       nf_ct_zone_init(&ct_zone, ct_limit->zone,
+                                       NF_CT_DEFAULT_ZONE_DIR, 0);
+
+                       conncount_key[0] = ct_limit->zone;
+                       zone_limit.count = nf_conncount_count(net, info->data,
+                                       conncount_key, NULL, &ct_zone);
+                       err = nla_put_nohdr(reply, sizeof(zone_limit),
+                                       &zone_limit);
+                       if (err)
+                               goto exit_err;
Can you write a single helper function to build reply zone_limit
object that can be used in ovs_ct_limit_get_zone_limit() and
ovs_ct_limit_get_all_zone_limit()?
+               }
+       }
+
+exit_err:
+       rcu_read_unlock();
+       return err;
+}
+
...
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help