net: netem: use a list in addition to rbtree

When testing high-bandwidth TCP streams with large windows, high latency, and low jitter, netem consumes a lot of CPU cycles doing rbtree rebalancing. This patch uses a linear list/queue in addition to the rbtree: if an incoming packet is past the tail of the linear queue, it is added there, otherwise it is inserted into the rbtree. Without this patch, perf shows netem_enqueue, netem_dequeue, and rb_* functions among the top offenders. With this patch, only netem_enqueue is noticeable if jitter is low/absent. Suggested-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Peter Oskolkov <posk@google.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-04 11:55:56 -08:00 · 2018-12-04 11:55:56 -08:00 · d66280b12b
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@ -77,6 +77,10 @@ struct netem_sched_data {
 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
 	struct rb_root t_root;

+	/* a linear queue; reduces rbtree rebalancing when jitter is low */
+	struct sk_buff	*t_head;
+	struct sk_buff	*t_tail;
+
 	/* optional qdisc for classful handling (NULL at netem init) */
 	struct Qdisc	*qdisc;

@ -369,12 +373,24 @@ static void tfifo_reset(struct Qdisc *sch)
 		rb_erase(&skb->rbnode, &q->t_root);
 		rtnl_kfree_skbs(skb, skb);
 	}
+
+	rtnl_kfree_skbs(q->t_head, q->t_tail);
+	q->t_head = NULL;
+	q->t_tail = NULL;
 }

 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	u64 tnext = netem_skb_cb(nskb)->time_to_send;
+
+	if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
+		if (q->t_tail)
+			q->t_tail->next = nskb;
+		else
+			q->t_head = nskb;
+		q->t_tail = nskb;
+	} else {
 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;

 		while (*p) {
@ -389,6 +405,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		}
 		rb_link_node(&nskb->rbnode, parent, p);
 		rb_insert_color(&nskb->rbnode, &q->t_root);
+	}
 	sch->q.qlen++;
 }

@ -530,9 +547,16 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				t_skb = skb_rb_last(&q->t_root);
 				t_last = netem_skb_cb(t_skb);
 				if (!last ||
-				    t_last->time_to_send > last->time_to_send) {
+				    t_last->time_to_send > last->time_to_send)
 					last = t_last;
 			}
+			if (q->t_tail) {
+				struct netem_skb_cb *t_last =
+					netem_skb_cb(q->t_tail);
+
+				if (!last ||
+				    t_last->time_to_send > last->time_to_send)
+					last = t_last;
 			}

 			if (last) {
@ -611,11 +635,38 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
 	q->slot.bytes_left = q->slot_config.max_bytes;
 }

+static struct sk_buff *netem_peek(struct netem_sched_data *q)
+{
+	struct sk_buff *skb = skb_rb_first(&q->t_root);
+	u64 t1, t2;
+
+	if (!skb)
+		return q->t_head;
+	if (!q->t_head)
+		return skb;
+
+	t1 = netem_skb_cb(skb)->time_to_send;
+	t2 = netem_skb_cb(q->t_head)->time_to_send;
+	if (t1 < t2)
+		return skb;
+	return q->t_head;
+}
+
+static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+{
+	if (skb == q->t_head) {
+		q->t_head = skb->next;
+		if (!q->t_head)
+			q->t_tail = NULL;
+	} else {
+		rb_erase(&skb->rbnode, &q->t_root);
+	}
+}
+
 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
-	struct rb_node *p;

 tfifo_dequeue:
 	skb = __qdisc_dequeue_head(&sch->q);
@ -625,20 +676,18 @@ deliver:
 		qdisc_bstats_update(sch, skb);
 		return skb;
 	}
-	p = rb_first(&q->t_root);
-	if (p) {
+	skb = netem_peek(q);
+	if (skb) {
 		u64 time_to_send;
 		u64 now = ktime_get_ns();

-		skb = rb_to_skb(p);
-
 		/* if more time remaining? */
 		time_to_send = netem_skb_cb(skb)->time_to_send;
 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 			get_slot_next(q, now);

 		if (time_to_send <= now && q->slot.slot_next <= now) {
-			rb_erase(p, &q->t_root);
+			netem_erase_head(q, skb);
 			sch->q.qlen--;
 			qdisc_qstats_backlog_dec(sch, skb);
 			skb->next = NULL;