Thread (9 messages) 9 messages, 5 authors, 2016-12-02

Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing

From: Tom Herbert <hidden>
Date: 2016-11-30 04:04:44

On Tue, Nov 29, 2016 at 9:15 AM, David Lebrun [off-list ref] wrote:
When multiple nexthops are available for a given route, the routing engine
chooses a nexthop by computing the flow hash through get_hash_from_flowi6
and by taking that value modulo the number of nexthops. The resulting value
indexes the nexthop to select. This method causes issues when a new nexthop
is added or one is removed (e.g. link failure). In that case, the number
of nexthops changes and potentially all the flows get re-routed to another
nexthop.
This is a lot of code to make ECMP work better. Can you be more
specific as to what the "issues" are? Assuming this is just the
transient packet reorder that happens in one link flap I am wondering
if this complexity is justified.

Tom
quoted hunk ↗ jump to hunk
This patch implements a consistent hash method to select the nexthop in
case of ECMP. The idea is to generate K slices (or intervals) for each
route with multiple nexthops. The nexthops are randomly assigned to those
slices, in a uniform manner. The number K is configurable through a sysctl
net.ipv6.route.ecmp_slices and is always an exponent of 2. To select the
nexthop, the algorithm takes the flow hash and computes an index which is
the flow hash modulo K. As K = 2^x, the modulo can be computed using a
simple binary AND operation (idx = hash & (K - 1)). The resulting index
references the selected nexthop. The lookup time complexity is thus O(1).

When a nexthop is added, it steals K/N slices from the other nexthops,
where N is the new number of nexthops. The slices are stolen randomly and
uniformly from the other nexthops. When a nexthop is removed, the orphan
slices are randomly reassigned to the other nexthops.

The number of slices for a route also fixes the maximum number of nexthops
possible for that route.

Signed-off-by: David Lebrun <redacted>
---
 include/net/ip6_fib.h    |  25 +++++++
 include/net/netns/ipv6.h |   1 +
 net/ipv6/ip6_fib.c       | 174 ++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/route.c         |  58 ++++++++--------
 4 files changed, 229 insertions(+), 29 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a74e2aa..29594cc 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -93,6 +93,30 @@ struct rt6key {

 struct fib6_table;

+/* Multipath nexthop.
+ * @nh is the actual nexthop
+ * @nslices is the number of slices assigned to this nexthop
+ */
+struct rt6_multi_nh {
+       struct rt6_info *nh;
+       unsigned int    nslices;
+};
+
+/* Multipath routes map.
+ * @nhs is an array of available nexthops
+ * @size is the number of elements in @nhs
+ * @nslices is the number of slices and the number of elements in @index
+ *         and is always in the form 2^x to prevent using a division for
+ *         the computation of the modulo.
+ * @index is an array mapping a slice index to a nexthop index in @nhs
+ */
+struct rt6_multi_map {
+       struct rt6_multi_nh     *nhs;
+       unsigned int            size;
+       unsigned int            nslices;
+       __u8                    *index;
+};
+
 struct rt6_info {
        struct dst_entry                dst;
@@ -113,6 +137,7 @@ struct rt6_info {
         */
        struct list_head                rt6i_siblings;
        unsigned int                    rt6i_nsiblings;
+       struct rt6_multi_map            *rt6i_nh_map;

        atomic_t                        rt6i_ref;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index de7745e..4967846 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
        int idgen_retries;
        int idgen_delay;
        int flowlabel_state_ranges;
+       int ip6_rt_ecmp_slices;
 };

 struct netns_ipv6 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ef54852..b6b1895 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -727,6 +727,166 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
        }
 }

+static void fib6_mp_free(struct rt6_info *rt)
+{
+       struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
+       struct rt6_info *sibling;
+
+       if (nh_map) {
+               list_for_each_entry(sibling, &rt->rt6i_siblings,
+                                   rt6i_siblings) {
+                       sibling->rt6i_nh_map = NULL;
+               }
+
+               rt->rt6i_nh_map = NULL;
+
+               kfree(nh_map->nhs);
+               kfree(nh_map->index);
+               kfree(nh_map);
+       }
+}
+
+static int fib6_mp_extend(struct net *net, struct rt6_info *sref,
+                         struct rt6_info *rt)
+{
+       struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+       struct rt6_multi_nh *tmp_nhs, *old_nhs;
+       unsigned int kslices;
+       unsigned int i, j;
+
+       if (!nh_map) {
+               __u8 *index;
+
+               nh_map = kmalloc(sizeof(*nh_map), GFP_ATOMIC);
+               if (!nh_map)
+                       return -ENOMEM;
+
+               nh_map->size = 1;
+               nh_map->nslices = (1 << net->ipv6.sysctl.ip6_rt_ecmp_slices);
+
+               tmp_nhs = kmalloc(sizeof(*tmp_nhs), GFP_ATOMIC);
+               if (!tmp_nhs) {
+                       kfree(nh_map);
+                       return -ENOMEM;
+               }
+
+               tmp_nhs[0].nh = sref;
+               tmp_nhs[0].nslices = nh_map->nslices;
+               nh_map->nhs = tmp_nhs;
+
+               index = kmalloc_array(nh_map->nslices, sizeof(*index),
+                                     GFP_ATOMIC);
+               if (!index) {
+                       kfree(nh_map->nhs);
+                       kfree(nh_map);
+                       return -ENOMEM;
+               }
+
+               for (i = 0; i < nh_map->nslices; i++)
+                       index[i] = 0;
+
+               nh_map->index = index;
+               sref->rt6i_nh_map = nh_map;
+       }
+
+       if (nh_map->size == nh_map->nslices)
+               return -ENOBUFS;
+
+       nh_map->size++;
+
+       tmp_nhs = kmalloc_array(nh_map->size, sizeof(*tmp_nhs), GFP_ATOMIC);
+       if (!tmp_nhs)
+               return -ENOMEM;
+
+       for (i = 0; i < nh_map->size - 1; i++)
+               tmp_nhs[i] = nh_map->nhs[i];
+
+       tmp_nhs[nh_map->size - 1].nh = rt;
+       tmp_nhs[nh_map->size - 1].nslices = 0;
+
+       kslices = nh_map->nslices / nh_map->size;
+
+       /* Loop over nexthops and steal a random slice until we have kslices for
+        * the new nexthop. This algorithm ensures that flows are taken as
+        * equally as possible from current nexthops.
+        *
+        * At each iteration, @j is the index in tmp_nhs of the nexthop
+        * to steal from and @idx is the index of the slice to steal
+        * among @j's slices.
+        */
+       for (i = 0, j = 0; i < kslices; i++) {
+               unsigned int idx, k = 0;
+
+               if (tmp_nhs[j].nslices == 1)
+                       continue;
+
+               idx = (prandom_u32() % tmp_nhs[j].nslices) + 1;
+               do {
+                       if (nh_map->index[k++] == j)
+                               idx--;
+               } while (idx);
+
+               nh_map->index[k - 1] = nh_map->size - 1;
+               tmp_nhs[nh_map->size - 1].nslices++;
+               tmp_nhs[j].nslices--;
+
+               j = (j + 1) % (nh_map->size - 1);
+       }
+
+       WARN_ON(tmp_nhs[nh_map->size - 1].nslices != kslices);
+
+       old_nhs = nh_map->nhs;
+       nh_map->nhs = tmp_nhs;
+       kfree(old_nhs);
+
+       rt->rt6i_nh_map = nh_map;
+
+       return 0;
+}
+
+static int fib6_mp_shrink(struct rt6_info *sref, struct rt6_info *rt)
+{
+       struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
+       struct rt6_multi_nh *tmp_nhs, *old_nhs;
+       unsigned int i, j, idx = 0;
+
+       tmp_nhs = kmalloc_array(nh_map->size - 1, sizeof(*tmp_nhs), GFP_ATOMIC);
+       if (!tmp_nhs)
+               return -ENOMEM;
+
+       for (i = 0, j = 0; i < nh_map->size; i++) {
+               if (nh_map->nhs[i].nh != rt)
+                       tmp_nhs[j++] = nh_map->nhs[i];
+               else
+                       idx = i;
+       }
+
+       nh_map->size--;
+       old_nhs = nh_map->nhs;
+       nh_map->nhs = tmp_nhs;
+       kfree(old_nhs);
+
+       rt->rt6i_nh_map = NULL;
+
+       /* Update the slices index. For each slice mapping to the removed
+        * nexthop, assign another random nexthop. For each slice mapping
+        * to a nexthop that was after the removed nexthop, decrement the
+        * nexthop index to reflect the shift. Nexthops that were before
+        * the removed nexthop are unaffected.
+        */
+       for (i = 0; i < nh_map->nslices; i++) {
+               if (nh_map->index[i] == idx) {
+                       j = prandom_u32() % nh_map->size;
+                       nh_map->index[i] = j;
+                       nh_map->nhs[j].nslices++;
+               } else if (nh_map->index[i] > idx) {
+                       nh_map->index[i]--;
+               }
+       }
+
+       return 0;
+}
+
 /*
  *     Insert routing information in a node.
  */
@@ -837,6 +997,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
                        }
                        sibling = sibling->dst.rt6_next;
                }
+               err = fib6_mp_extend(info->nl_net, sibling, rt);
+               if (err < 0)
+                       return err;
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if counters does not match, list of siblings
                 * is broken!
@@ -900,6 +1063,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
                        fn->fn_flags |= RTN_RTINFO;
                }
                nsiblings = iter->rt6i_nsiblings;
+               if (nsiblings)
+                       fib6_mp_free(iter);
                fib6_purge_rt(iter, fn, info->nl_net);
                rt6_release(iter);
@@ -1407,13 +1572,20 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,

        /* Remove this entry from other siblings */
        if (rt->rt6i_nsiblings) {
-               struct rt6_info *sibling, *next_sibling;
+               struct rt6_info *sibling, *next_sibling, *sref;
+
+               sref = list_first_entry(&rt->rt6i_siblings, struct rt6_info,
+                                       rt6i_siblings);

                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->rt6i_siblings, rt6i_siblings)
                        sibling->rt6i_nsiblings--;
                rt->rt6i_nsiblings = 0;
                list_del_init(&rt->rt6i_siblings);
+               if (!sref->rt6i_nsiblings)
+                       fib6_mp_free(sref);
+               else
+                       fib6_mp_shrink(sref, rt);
        }

        /* Adjust walkers */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b317bb1..e9f13e0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -427,39 +427,27 @@ static bool rt6_check_expired(const struct rt6_info *rt)
        return false;
 }

-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-                              const struct flowi6 *fl6)
-{
-       return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
                                             struct flowi6 *fl6, int oif,
                                             int strict)
 {
-       struct rt6_info *sibling, *next_sibling;
-       int route_choosen;
+       struct rt6_multi_map *nh_map = match->rt6i_nh_map;
+       __u32 hash = get_hash_from_flowi6(fl6);
+       unsigned int slice, idx;
+       struct rt6_info *res;

-       route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
-       /* Don't change the route, if route_choosen == 0
-        * (siblings does not include ourself)
-        */
-       if (route_choosen)
-               list_for_each_entry_safe(sibling, next_sibling,
-                               &match->rt6i_siblings, rt6i_siblings) {
-                       route_choosen--;
-                       if (route_choosen == 0) {
-                               if (rt6_score_route(sibling, oif, strict) < 0)
-                                       break;
-                               match = sibling;
-                               break;
-                       }
-               }
-       return match;
+       WARN_ON(!nh_map);
+       if (!nh_map)
+               return match;
+
+       slice = hash & (nh_map->nslices - 1);
+       idx = nh_map->index[slice];
+       res = nh_map->nhs[idx].nh;
+
+       if (rt6_score_route(res, oif, strict) < 0)
+               res = match;
+
+       return res;
 }

 /*
@@ -3550,6 +3538,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
        return 0;
 }

+static int one = 1;
+static int thirtyone = 31;
+
 struct ctl_table ipv6_route_table_template[] = {
        {
                .procname       =       "flush",
@@ -3621,6 +3612,15 @@ struct ctl_table ipv6_route_table_template[] = {
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
+       {
+               .procname       =       "ecmp_slices",
+               .data           =       &init_net.ipv6.sysctl.ip6_rt_ecmp_slices,
+               .maxlen         =       sizeof(int),
+               .mode           =       0644,
+               .proc_handler   =       proc_dointvec_minmax,
+               .extra1         =       &one,
+               .extra2         =       &thirtyone,
+       },
        { }
 };
@@ -3644,6 +3644,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+               table[10].data = &net->ipv6.sysctl.ip6_rt_ecmp_slices;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
@@ -3707,6 +3708,7 @@ static int __net_init ip6_route_net_init(struct net *net)
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+       net->ipv6.sysctl.ip6_rt_ecmp_slices = 5;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

--
2.7.3
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help