[bpf-next V1 PATCH 13/15] mlx5: use page_pool for xdp_return_frame call
From: Jesper Dangaard Brouer <hidden>
Date: 2018-03-06 21:48:53
Subsystem:
mellanox ethernet driver (mlx5e), mellanox mlx5 core vpi driver, networking drivers, the rest · Maintainers:
Saeed Mahameed, Tariq Toukan, Mark Bloch, Leon Romanovsky, Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
This patch shows how it is possible to have both the driver-local page cache, which uses an elevated refcnt for "catching"/avoiding SKB put_page, and at the same time have pages returned to the page_pool from ndo_xdp_xmit DMA completion. Performance is surprisingly good. Tested DMA-TX completion on ixgbe, which calls "xdp_return_frame", which in turn calls page_pool_put_page(). Stats show DMA-TX-completion runs on CPU#9 and mlx5 RX runs on CPU#5. (Internally page_pool uses ptr_ring, which is what gives the good cross-CPU performance). Show adapter(s) (ixgbe2 mlx5p2) statistics (ONLY that changed!) Ethtool(ixgbe2 ) stat: 732863573 ( 732,863,573) <= tx_bytes /sec Ethtool(ixgbe2 ) stat: 781724427 ( 781,724,427) <= tx_bytes_nic /sec Ethtool(ixgbe2 ) stat: 12214393 ( 12,214,393) <= tx_packets /sec Ethtool(ixgbe2 ) stat: 12214435 ( 12,214,435) <= tx_pkts_nic /sec Ethtool(mlx5p2 ) stat: 12211786 ( 12,211,786) <= rx3_cache_empty /sec Ethtool(mlx5p2 ) stat: 36506736 ( 36,506,736) <= rx_64_bytes_phy /sec Ethtool(mlx5p2 ) stat: 2336430575 ( 2,336,430,575) <= rx_bytes_phy /sec Ethtool(mlx5p2 ) stat: 12211786 ( 12,211,786) <= rx_cache_empty /sec Ethtool(mlx5p2 ) stat: 22823073 ( 22,823,073) <= rx_discards_phy /sec Ethtool(mlx5p2 ) stat: 1471860 ( 1,471,860) <= rx_out_of_buffer /sec Ethtool(mlx5p2 ) stat: 36506715 ( 36,506,715) <= rx_packets_phy /sec Ethtool(mlx5p2 ) stat: 2336542282 ( 2,336,542,282) <= rx_prio0_bytes /sec Ethtool(mlx5p2 ) stat: 13683921 ( 13,683,921) <= rx_prio0_packets /sec Ethtool(mlx5p2 ) stat: 821015537 ( 821,015,537) <= rx_vport_unicast_bytes /sec Ethtool(mlx5p2 ) stat: 13683608 ( 13,683,608) <= rx_vport_unicast_packets /sec Before this patch: single flow performance was 6Mpps, and if I started two flows the collective performance dropped to 4Mpps, because we hit the page allocator lock (further negative scaling occurs). 
Signed-off-by: Jesper Dangaard Brouer <redacted> --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 ++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 39 ++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 10 ++++- 3 files changed, 45 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 28cc26debeda..ab91166f7c5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h@@ -53,6 +53,8 @@ #include "mlx5_core.h" #include "en_stats.h" +struct page_pool; + #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -535,6 +537,7 @@ struct mlx5e_rq { /* XDP */ struct bpf_prog *xdp_prog; struct mlx5e_xdpsq xdpsq; + struct page_pool *page_pool; /* control */ struct mlx5_wq_ctrl wq_ctrl;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 49732c8c27c1..fbe27110ff02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c@@ -44,6 +44,8 @@ #include "accel/ipsec.h" #include "vxlan.h" +#include <net/page_pool.h> + struct mlx5e_rq_param { u32 rqc[MLX5_ST_SZ_DW(rqc)]; struct mlx5_wq_param wq;
@@ -396,6 +398,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, int err; int i; + struct page_pool_params pp_params = { 0 }; + rqp->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
@@ -506,12 +510,33 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, rq->mkey_be = c->mkey_be; } - /* This must only be activate for order-0 pages */ - if (rq->xdp_prog) - err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, - MEM_TYPE_PAGE_ORDER0, NULL); - if (err) + /* Create a page_pool and register it with rxq */ + pp_params.size = PAGE_POOL_PARAMS_SIZE; + pp_params.order = rq->buff.page_order; + pp_params.dev = c->pdev; + pp_params.nid = cpu_to_node(c->cpu); + pp_params.dma_dir = rq->buff.map_dir; + pp_params.pool_size = 1 << params->log_rq_size; + pp_params.flags = 0; /* No-internal DMA mapping in page_pool */ + + /* page_pool can be used even when there is no rq->xdp_prog, + * given page_pool does not handle DMA mapping there is no + * required state to clear. And page_pool gracefully handle + * elevated refcnt. + */ + rq->page_pool = page_pool_create(&pp_params); + if (IS_ERR_OR_NULL(rq->page_pool)) { + kfree(rq->wqe.frag_info); + rq->page_pool = NULL; + err = -ENOMEM; goto err_rq_wq_destroy; + } + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_PAGE_POOL, rq->page_pool); + if (err) { + err = -ENOMEM; + goto err_rq_wq_destroy; + } for (i = 0; i < wq_sz; i++) { struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
@@ -549,6 +574,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, if (rq->xdp_prog) bpf_prog_put(rq->xdp_prog); xdp_rxq_info_unreg(&rq->xdp_rxq); + if (rq->page_pool) + page_pool_destroy_rcu(rq->page_pool); mlx5_wq_destroy(&rq->wq_ctrl); return err;
@@ -562,6 +589,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) bpf_prog_put(rq->xdp_prog); xdp_rxq_info_unreg(&rq->xdp_rxq); + if (rq->page_pool) + page_pool_destroy_rcu(rq->page_pool); switch (rq->wq_type) { case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 6dcc3e8fbd3e..4898239467d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c@@ -45,6 +45,8 @@ #include "en_accel/ipsec_rxtx.h" #include "lib/clock.h" +#include <net/page_pool.h> + static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config) { return config->rx_filter == HWTSTAMP_FILTER_ALL;
@@ -221,7 +223,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, if (mlx5e_rx_cache_get(rq, dma_info)) return 0; - dma_info->page = dev_alloc_pages(rq->buff.page_order); + dma_info->page = page_pool_dev_alloc_pages(rq->page_pool); if (unlikely(!dma_info->page)) return -ENOMEM;
@@ -250,7 +252,11 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, return; mlx5e_page_dma_unmap(rq, dma_info); - put_page(dma_info->page); + + if (likely(recycle)) + page_pool_recycle_direct(rq->page_pool, dma_info->page); + else + put_page(dma_info->page); } static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,