[PATCH v5 net-next 05/36] nvme-tcp: Add DDP offload control path
From: Boris Pismenny <borisp@nvidia.com>
Date: 2021-07-22 11:06:11
Also in:
netdev
Subsystem:
networking [general], nvm express driver, the rest · Maintainers:
"David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg, Linus Torvalds
From: Boris Pismenny <redacted> This commit introduces direct data placement offload to NVME TCP. There is a context per queue, which is established after the handshake using the ulp_ddp_sk_add/del NDOs. Additionally, a resynchronization routine is used to assist hardware recovery from TCP OOO, and continue the offload. Resynchronization operates as follows: 1. TCP OOO causes the NIC HW to stop the offload 2. NIC HW identifies a PDU header at some TCP sequence number, and asks NVMe-TCP to confirm it. This request is delivered from the NIC driver to NVMe-TCP by first finding the socket for the packet that triggered the request, and then finding the nvme_tcp_queue that is used by this routine. Finally, the request is recorded in the nvme_tcp_queue. 3. When NVMe-TCP observes the requested TCP sequence, it will compare it with the PDU header TCP sequence, and report the result to the NIC driver (ulp_ddp_resync), which will update the HW, and resume offload when all is successful. Furthermore, we let the offloading driver advertise what is the max hw sectors/segments via ulp_ddp_limits. A follow-up patch introduces the data-path changes required for this offload. Signed-off-by: Boris Pismenny <redacted> Signed-off-by: Ben Ben-Ishay <redacted> Signed-off-by: Or Gerlitz <redacted> Signed-off-by: Yoray Zack <redacted> --- drivers/nvme/host/tcp.c | 180 +++++++++++++++++++++++++++++++++++++++- include/linux/skbuff.h | 4 +- net/core/datagram.c | 4 +- 3 files changed, 182 insertions(+), 6 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c7bd37103cf4..f1a5520cabec 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c@@ -14,6 +14,7 @@ #include <linux/blk-mq.h> #include <crypto/hash.h> #include <net/busy_poll.h> +#include <net/ulp_ddp.h> #include "nvme.h" #include "fabrics.h"
@@ -62,6 +63,7 @@ enum nvme_tcp_queue_flags { NVME_TCP_Q_ALLOCATED = 0, NVME_TCP_Q_LIVE = 1, NVME_TCP_Q_POLLING = 2, + NVME_TCP_Q_OFF_DDP = 3, }; enum nvme_tcp_recv_state {
@@ -111,6 +113,8 @@ struct nvme_tcp_queue { void (*state_change)(struct sock *); void (*data_ready)(struct sock *); void (*write_space)(struct sock *); + + atomic64_t resync_req; }; struct nvme_tcp_ctrl {
@@ -130,6 +134,8 @@ struct nvme_tcp_ctrl { struct delayed_work connect_work; struct nvme_tcp_request async_req; u32 io_queues[HCTX_MAX_TYPES]; + + struct net_device *offloading_netdev; }; static LIST_HEAD(nvme_tcp_ctrl_list);
@@ -219,6 +225,167 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, return nvme_tcp_pdu_data_left(req) <= len; } +#ifdef CONFIG_ULP_DDP + +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); +static const struct ulp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { + .resync_request = nvme_tcp_resync_request, +}; + +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) +{ + struct net_device *netdev = queue->ctrl->offloading_netdev; + struct nvme_tcp_ddp_config config = {}; + int ret; + + if (!netdev || !(netdev->features & NETIF_F_HW_ULP_DDP)) + return -EOPNOTSUPP; + + config.cfg.type = ULP_DDP_NVME; + config.pfv = NVME_TCP_PFV_1_0; + config.cpda = 0; + config.dgst = queue->hdr_digest ? + NVME_TCP_HDR_DIGEST_ENABLE : 0; + config.dgst |= queue->data_digest ? + NVME_TCP_DATA_DIGEST_ENABLE : 0; + config.queue_size = queue->queue_size; + config.queue_id = nvme_tcp_queue_id(queue); + config.io_cpu = queue->io_cpu; + + dev_hold(netdev); /* put by unoffload_socket */ + ret = netdev->ulp_ddp_ops->ulp_ddp_sk_add(netdev, + queue->sock->sk, + &config.cfg); + if (ret) { + dev_put(netdev); + return ret; + } + + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops; + if (netdev->features & NETIF_F_HW_ULP_DDP) + set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); + + return ret; +} + +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) +{ + struct net_device *netdev = queue->ctrl->offloading_netdev; + + if (!netdev) { + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); + return; + } + + clear_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); + + netdev->ulp_ddp_ops->ulp_ddp_sk_del(netdev, queue->sock->sk); + + inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL; + dev_put(netdev); /* held by offload_socket */ +} + +static int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue) +{ + struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true); + struct ulp_ddp_limits limits; + int ret = 0; + + if (!netdev) { + dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n"); + queue->ctrl->offloading_netdev = NULL; + return -ENODEV; + } + + if ((netdev->features & NETIF_F_HW_ULP_DDP) && + netdev->ulp_ddp_ops && + netdev->ulp_ddp_ops->ulp_ddp_limits) + ret = netdev->ulp_ddp_ops->ulp_ddp_limits(netdev, &limits); + else + ret = -EOPNOTSUPP; + + if (!ret) { + queue->ctrl->offloading_netdev = netdev; + dev_dbg_ratelimited(queue->ctrl->ctrl.device, + "netdev %s offload limits: max_ddp_sgl_len %d\n", + netdev->name, limits.max_ddp_sgl_len); + queue->ctrl->ctrl.max_segments = limits.max_ddp_sgl_len; + queue->ctrl->ctrl.max_hw_sectors = + limits.max_ddp_sgl_len << (ilog2(SZ_4K) - 9); + } else { + queue->ctrl->offloading_netdev = NULL; + } + + /* release the device as no offload context is established yet. */ + dev_put(netdev); + + return ret; +} + +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int offset) +{ + u64 pdu_seq = TCP_SKB_CB(skb)->seq + offset - queue->pdu_offset; + struct net_device *netdev = queue->ctrl->offloading_netdev; + u64 pdu_val = (pdu_seq << 32) | ULP_DDP_RESYNC_REQ; + u64 resync_val; + u32 resync_seq; + + resync_val = atomic64_read(&queue->resync_req); + /* Lower 32 bit flags. Check validity of the request */ + if ((resync_val & ULP_DDP_RESYNC_REQ) == 0) + return; + + /* Obtain and check requested sequence number: is this PDU header before the request? */ + resync_seq = resync_val >> 32; + if (before(pdu_seq, resync_seq)) + return; + + if (unlikely(!netdev)) { + pr_info_ratelimited("%s: netdev not found\n", __func__); + return; + } + + /** + * The atomic operation gurarantees that we don't miss any NIC driver + * resync requests submitted after the above checks. + */ + if (atomic64_cmpxchg(&queue->resync_req, pdu_val, + pdu_val & ~ULP_DDP_RESYNC_REQ) != atomic64_read(&queue->resync_req)) + netdev->ulp_ddp_ops->ulp_ddp_resync(netdev, queue->sock->sk, pdu_seq); +} + +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags) +{ + struct nvme_tcp_queue *queue = sk->sk_user_data; + + atomic64_set(&queue->resync_req, + (((uint64_t)seq << 32) | flags)); + + return true; +} + +#else + +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) +{ + return -EINVAL; +} + +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) +{} + +static int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue) +{ + return -EINVAL; +} + +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int offset) +{} + +#endif + static void nvme_tcp_init_iter(struct nvme_tcp_request *req, unsigned int dir) {
@@ -649,6 +816,9 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); int ret; + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) + nvme_tcp_resync_response(queue, skb, *offset); + ret = skb_copy_bits(skb, *offset, &pdu[queue->pdu_offset], rcv_len); if (unlikely(ret))
@@ -1555,6 +1725,9 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) kernel_sock_shutdown(queue->sock, SHUT_RDWR); nvme_tcp_restore_sock_calls(queue); cancel_work_sync(&queue->io_work); + + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) + nvme_tcp_unoffload_socket(queue); } static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
@@ -1573,10 +1746,13 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); int ret; - if (idx) + if (idx) { ret = nvmf_connect_io_queue(nctrl, idx, false); - else + nvme_tcp_offload_socket(&ctrl->queues[idx]); + } else { ret = nvmf_connect_admin_queue(nctrl); + nvme_tcp_offload_limits(&ctrl->queues[idx]); + } if (!ret) { set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8c1bfd7081d1..55dc858ff349 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h@@ -3613,7 +3613,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); -#ifdef CONFIG_TCP_DDP +#ifdef CONFIG_ULP_DDP int skb_ddp_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); #endif
@@ -3627,7 +3627,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); -#ifdef CONFIG_TCP_DDP +#ifdef CONFIG_ULP_DDP int skb_ddp_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d346fd5da22c..5ad5fb22d3f8 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c@@ -495,7 +495,7 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, return 0; } -#ifdef CONFIG_TCP_DDP +#ifdef CONFIG_ULP_DDP /** * skb_ddp_copy_and_hash_datagram_iter - Copies datagrams from skb frags to * an iterator and update a hash. If the iterator and skb frag point to the
@@ -534,7 +534,7 @@ int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); -#ifdef CONFIG_TCP_DDP +#ifdef CONFIG_ULP_DDP static size_t simple_ddp_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i)
--
2.24.1
_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme