Re: [PATCH 1/2] NET: Multiple queue network device support

From: Thomas Graf <tgraf@suug.ch>
Date: 2007-03-09 13:40:20
Also in: lkml

* Kok, Auke [off-list ref] 2007-02-08 16:09

quoted hunk ↗ jump to hunk

diff --git a/net/core/dev.c b/net/core/dev.c
index 455d589..42b635c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c

@@ -1477,6 +1477,49 @@ gso:
 	skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
 #endif
 	if (q->enqueue) {
+#ifdef CONFIG_NET_MULTI_QUEUE_DEVICE
+		int queue_index;
+		/* If we're a multi-queue device, get a queue index to lock */
+		if (netif_is_multiqueue(dev))
+		{
+			/* Get the queue index and lock it. */
+			if (likely(q->ops->map_queue)) {
+				queue_index = q->ops->map_queue(skb, q);
+				spin_lock(&dev->egress_subqueue[queue_index].queue_lock);
+				rc = q->enqueue(skb, q);
+				/*
+				 * Unlock because the underlying qdisc
+				 * may queue and send a packet from a
+				 * different queue.
+				 */
+				spin_unlock(&dev->egress_subqueue[queue_index].queue_lock);
+				qdisc_run(dev);

I really dislike this asymmetric locking logic, while enqueue() is called
with queue_lock held dequeue() is repsonsible to acquire the lock for
qdisc_restart().

+				rc = rc == NET_XMIT_BYPASS
+				           ? NET_XMIT_SUCCESS : rc;
+				goto out;
+			} else {
+				printk(KERN_CRIT "Device %s is "
+					"multiqueue, but map_queue is "
+					"undefined in the qdisc!!\n",
+					dev->name);
+				kfree_skb(skb);

Move this check to tc_modify_qdisc(), it's useless to check this for
every packet being delivered.

+			}
+		} else {
+			/* We're not a multi-queue device. */
+			spin_lock(&dev->queue_lock);
+			q = dev->qdisc;
+			if (q->enqueue) {
+				rc = q->enqueue(skb, q);
+				qdisc_run(dev);
+				spin_unlock(&dev->queue_lock);
+				rc = rc == NET_XMIT_BYPASS
+				           ? NET_XMIT_SUCCESS : rc;
+				goto out;
+			}
+			spin_unlock(&dev->queue_lock);

Please don't duplicate already existing code.

quoted hunk ↗ jump to hunk

@@ -130,6 +140,72 @@ static inline int qdisc_restart(struct net_device *dev)
 		}
 		
 		{
+#ifdef CONFIG_NET_MULTI_QUEUE_DEVICE
+			if (netif_is_multiqueue(dev)) {
+				if (likely(q->ops->map_queue)) {
+					queue_index = q->ops->map_queue(skb, q);
+				} else {
+					printk(KERN_CRIT "Device %s is "
+						"multiqueue, but map_queue is "
+						"undefined in the qdisc!!\n",
+						dev->name);
+					goto requeue;
+				}

Yet another condition completely useless for every transmission.

+				spin_unlock(&dev->egress_subqueue[queue_index].queue_lock);
+				/* Check top level device, and any sub-device */
+				if ((!netif_queue_stopped(dev)) &&
+				  (!netif_subqueue_stopped(dev, queue_index))) {
+					int ret;
+					ret = dev->hard_start_subqueue_xmit(skb, dev, queue_index);
+					if (ret == NETDEV_TX_OK) {
+						if (!nolock) {
+							netif_tx_unlock(dev);
+						}
+						return -1;
+					}
+					if (ret == NETDEV_TX_LOCKED && nolock) {
+						spin_lock(&dev->egress_subqueue[queue_index].queue_lock);
+						goto collision;
+					}
+				}
+				/* NETDEV_TX_BUSY - we need to requeue */
+				/* Release the driver */
+				if (!nolock) {
+					netif_tx_unlock(dev);
+				}
+				spin_lock(&dev->egress_subqueue[queue_index].queue_lock);
+				q = dev->qdisc;

This is identical to the existing logic except for the different lock,
the additional to check subqueue state and a different hard_start_xmit
call. Share the logic.

+			}
+			else
+			{
+				/* We're a single-queue device */
+				/* And release queue */
+				spin_unlock(&dev->queue_lock);
+				if (!netif_queue_stopped(dev)) {
+					int ret;
+
+					ret = dev->hard_start_xmit(skb, dev);
+					if (ret == NETDEV_TX_OK) {
+						if (!nolock) {
+							netif_tx_unlock(dev);
+						}
+						spin_lock(&dev->queue_lock);
+						return -1;
+					}
+					if (ret == NETDEV_TX_LOCKED && nolock) {
+						spin_lock(&dev->queue_lock);
+						goto collision;
+					}
+				}
+				/* NETDEV_TX_BUSY - we need to requeue */
+				/* Release the driver */
+				if (!nolock) {
+					netif_tx_unlock(dev);
+				}
+				spin_lock(&dev->queue_lock);
+				q = dev->qdisc;

Again, you just copied the existing code into a separate branch, fix the
branching logic!

quoted hunk ↗ jump to hunk

@@ -356,10 +454,18 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
 	struct sk_buff_head *list = qdisc_priv(qdisc);
 
 	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+#ifdef CONFIG_NET_MULTI_QUEUE_DEVICE
+		if (netif_is_multiqueue(qdisc->dev))
+			spin_lock(&qdisc->dev->egress_subqueue[0].queue_lock);
+#endif
 		if (!skb_queue_empty(list + prio)) {
 			qdisc->q.qlen--;
 			return __qdisc_dequeue_head(qdisc, list + prio);
 		}
+#ifdef CONFIG_NET_MULTI_QUEUE_DEVICE
+		if (netif_is_multiqueue(qdisc->dev))
+			spin_unlock(&qdisc->dev->egress_subqueue[0].queue_lock);
+#endif

This lock/unlock for every band definitely screws performance.

quoted hunk ↗ jump to hunk

 	}
 
 	return NULL;

@@ -141,18 +174,53 @@ prio_dequeue(struct Qdisc* sch)
 	struct sk_buff *skb;
 	struct prio_sched_data *q = qdisc_priv(sch);
 	int prio;
+#ifdef CONFIG_NET_MULTI_QUEUE_DEVICE
+	int queue;
+#endif
 	struct Qdisc *qdisc;
 
+	/*
+	 * If we're multiqueue, the basic approach is try the lock on each
+	 * queue.  If it's locked, either we're already dequeuing, or enqueue
+	 * is doing something.  Go to the next band if we're locked.  Once we
+	 * have a packet, unlock the queue.  NOTE: the underlying qdisc CANNOT
+	 * be a PRIO qdisc, otherwise we will deadlock.  FIXME
+	 */
 	for (prio = 0; prio < q->bands; prio++) {
+#ifdef CONFIG_NET_MULTI_QUEUE_DEVICE
+		if (netif_is_multiqueue(sch->dev)) {
+			queue = q->band2queue[prio];
+			if (spin_trylock(&sch->dev->egress_subqueue[queue].queue_lock)) {
+				qdisc = q->queues[prio];
+				skb = qdisc->dequeue(qdisc);
+				if (skb) {
+					sch->q.qlen--;
+					skb->priority = prio;
+					spin_unlock(&sch->dev->egress_subqueue[queue].queue_lock);
+					return skb;
+				}
+				spin_unlock(&sch->dev->egress_subqueue[queue].queue_lock);
+			}

Your modified qdisc_restart() expects the queue_lock to be locked, how
can this work?

+		} else {
+			qdisc = q->queues[prio];
+			skb = qdisc->dequeue(qdisc);
+			if (skb) {
+				sch->q.qlen--;
+				skb->priority = prio;
+				return skb;
+			}
+		}
+#else
 		qdisc = q->queues[prio];
 		skb = qdisc->dequeue(qdisc);
 		if (skb) {
 			sch->q.qlen--;
+			skb->priority = prio;
 			return skb;
 		}
+#endif
 	}
 	return NULL;
-
 }
 
 static unsigned int prio_drop(struct Qdisc* sch)

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help