Thread (20 messages) 20 messages, 2 authors, 2018-03-08

Re: [bpf-next V1 PATCH 13/15] mlx5: use page_pool for xdp_return_frame call

From: Jesper Dangaard Brouer <hidden>
Date: 2018-03-08 12:44:53

On Wed, 7 Mar 2018 14:11:36 +0200
Tariq Toukan [off-list ref] wrote:
On 06/03/2018 11:48 PM, Jesper Dangaard Brouer wrote:
quoted
This patch shows how it is possible to have both the driver local page
cache, which uses elevated refcnt for "catching"/avoiding SKB
put_page.  And at the same time, have pages getting returned to the
page_pool from ndp_xdp_xmit DMA completion.

Performance is surprisingly good. Tested DMA-TX completion on ixgbe,
that calls "xdp_return_frame", which call page_pool_put_page().
Stats show DMA-TX-completion runs on CPU#9 and mlx5 RX runs on CPU#5.
(Internally page_pool uses ptr_ring, which is what gives the good
cross CPU performance).

Show adapter(s) (ixgbe2 mlx5p2) statistics (ONLY that changed!)
Ethtool(ixgbe2  ) stat:    732863573 (    732,863,573) <= tx_bytes /sec
Ethtool(ixgbe2  ) stat:    781724427 (    781,724,427) <= tx_bytes_nic /sec
Ethtool(ixgbe2  ) stat:     12214393 (     12,214,393) <= tx_packets /sec
Ethtool(ixgbe2  ) stat:     12214435 (     12,214,435) <= tx_pkts_nic /sec
Ethtool(mlx5p2  ) stat:     12211786 (     12,211,786) <= rx3_cache_empty /sec
Ethtool(mlx5p2  ) stat:     36506736 (     36,506,736) <= rx_64_bytes_phy /sec
Ethtool(mlx5p2  ) stat:   2336430575 (  2,336,430,575) <= rx_bytes_phy /sec
Ethtool(mlx5p2  ) stat:     12211786 (     12,211,786) <= rx_cache_empty /sec
Ethtool(mlx5p2  ) stat:     22823073 (     22,823,073) <= rx_discards_phy /sec
Ethtool(mlx5p2  ) stat:      1471860 (      1,471,860) <= rx_out_of_buffer /sec
Ethtool(mlx5p2  ) stat:     36506715 (     36,506,715) <= rx_packets_phy /sec
Ethtool(mlx5p2  ) stat:   2336542282 (  2,336,542,282) <= rx_prio0_bytes /sec
Ethtool(mlx5p2  ) stat:     13683921 (     13,683,921) <= rx_prio0_packets /sec
Ethtool(mlx5p2  ) stat:    821015537 (    821,015,537) <= rx_vport_unicast_bytes /sec
Ethtool(mlx5p2  ) stat:     13683608 (     13,683,608) <= rx_vport_unicast_packets /sec

Before this patch: single flow performance was 6Mpps, and if I started
two flows the collective performance drop to 4Mpps, because we hit the
page allocator lock (further negative scaling occurs).

Signed-off-by: Jesper Dangaard Brouer <redacted>
---
  drivers/net/ethernet/mellanox/mlx5/core/en.h      |    3 ++
  drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   39 ++++++++++++++++++---
  drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |   10 ++++-
  3 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 28cc26debeda..ab91166f7c5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -53,6 +53,8 @@
  #include "mlx5_core.h"
  #include "en_stats.h"
  
+struct page_pool;
+  
you can have the include here instead:
#include <net/page_pool.h>

and remove it from the .c files.
Hmm... I'm not too happy with this change request.  I did this to avoid
having to recompile all the drivers files when I changed page_pool.h
(yes, the kernel makefile system detect the dependencies automatically).
I'll keep this, let me know if you feel strongly about this...
quoted
  #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
  
  #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -535,6 +537,7 @@ struct mlx5e_rq {
  	/* XDP */
  	struct bpf_prog       *xdp_prog;
  	struct mlx5e_xdpsq     xdpsq;
+	struct page_pool      *page_pool;
  
  	/* control */
  	struct mlx5_wq_ctrl    wq_ctrl;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 49732c8c27c1..fbe27110ff02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -44,6 +44,8 @@
  #include "accel/ipsec.h"
  #include "vxlan.h"
  
+#include <net/page_pool.h>
+
  struct mlx5e_rq_param {
  	u32			rqc[MLX5_ST_SZ_DW(rqc)];
  	struct mlx5_wq_param	wq;
@@ -396,6 +398,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
  	int err;
  	int i;
  
+	struct page_pool_params pp_params = { 0 };
+
  	rqp->wq.db_numa_node = cpu_to_node(c->cpu);
  
  	err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
@@ -506,12 +510,33 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
  		rq->mkey_be = c->mkey_be;
  	}
  
-	/* This must only be activate for order-0 pages */
-	if (rq->xdp_prog)
-		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
-						 MEM_TYPE_PAGE_ORDER0, NULL);
-	if (err)
+	/* Create a page_pool and register it with rxq */
+	pp_params.size      = PAGE_POOL_PARAMS_SIZE;
+	pp_params.order     = rq->buff.page_order;
+	pp_params.dev       = c->pdev;
+	pp_params.nid       = cpu_to_node(c->cpu);
+	pp_params.dma_dir   = rq->buff.map_dir;
+	pp_params.pool_size = 1 << params->log_rq_size;  
if rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ, need to multiply 
by MLX5_MPWRQ_PAGES_PER_WQE.
Okay, doing so.
quoted
+	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
+
+	/* page_pool can be used even when there is no rq->xdp_prog,
+	 * given page_pool does not handle DMA mapping there is no
+	 * required state to clear. And page_pool gracefully handle
+	 * elevated refcnt.
+	 */
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR_OR_NULL(rq->page_pool)) {
+		kfree(rq->wqe.frag_info);
+		rq->page_pool = NULL;
+		err = -ENOMEM;  
better: PTR_ERR(rq->page_pool)
Ok, I changed page_pool_create() return values to be a little more sane
(not returning NULL), so we can do as you suggest (else using PTR_ERR
directly for err can return zero, which indicate success for
mlx5e_alloc_rq() even when it failed...).

 
quoted
  		goto err_rq_wq_destroy;
+	}
+	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+					 MEM_TYPE_PAGE_POOL,
rq->page_pool);
+	if (err) {
+		err = -ENOMEM;  
why change err value?
Okay.
quoted
+		goto err_rq_wq_destroy;
+	}
  
  	for (i = 0; i < wq_sz; i++) {
  		struct mlx5e_rx_wqe *wqe =
mlx5_wq_ll_get_wqe(&rq->wq, i); @@ -549,6 +574,8 @@ static int
mlx5e_alloc_rq(struct mlx5e_channel *c, if (rq->xdp_prog)
  		bpf_prog_put(rq->xdp_prog);
  	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy_rcu(rq->page_pool);
  	mlx5_wq_destroy(&rq->wq_ctrl);
  
  	return err;
@@ -562,6 +589,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
  		bpf_prog_put(rq->xdp_prog);
  
  	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy_rcu(rq->page_pool);
  
  	switch (rq->wq_type) {
  	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index
6dcc3e8fbd3e..4898239467d9 100644 ---
a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -45,6 +45,8 @@
  #include "en_accel/ipsec_rxtx.h"
  #include "lib/clock.h"
  
+#include <net/page_pool.h>
+
  static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config
*config) {
  	return config->rx_filter == HWTSTAMP_FILTER_ALL;
@@ -221,7 +223,7 @@ static inline int
mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, if
(mlx5e_rx_cache_get(rq, dma_info)) return 0;
  
-	dma_info->page = dev_alloc_pages(rq->buff.page_order);
+	dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
  	if (unlikely(!dma_info->page))
  		return -ENOMEM;
  
@@ -250,7 +252,11 @@ void mlx5e_page_release(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info, return;
  
  	mlx5e_page_dma_unmap(rq, dma_info);
-	put_page(dma_info->page);
+
+	if (likely(recycle))
+		page_pool_recycle_direct(rq->page_pool,
dma_info->page);
+	else
+		put_page(dma_info->page);
  }
    
we can save a branch, this way:

-       if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
+       if (likely(recycle)) {
+               if (mlx5e_rx_cache_put(rq, dma_info))
+                       return;
+
+               mlx5e_page_dma_unmap(rq, dma_info);
+               page_pool_recycle_direct(rq->page_pool,
dma_info->page); return;
+       }
Okay, doing this, but you missed that we now need to call
mlx5e_page_dma_unmap(rq, dma_info) in two cases now.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help