Skip to content

Commit 2f530df

Browse files
vladimiroltean authored and davem330 committed
net/sched: taprio: give higher priority to higher TCs in software dequeue mode
Current taprio software implementation is haunted by the shadow of the igb/igc hardware model. It iterates over child qdiscs in increasing order of TXQ index, therefore giving higher xmit priority to TXQ 0 and lower to TXQ N. According to discussions with Vinicius, that is the default (perhaps even unchangeable) prioritization scheme used for the NICs that taprio was first written for (igb, igc), and we have a case of two bugs canceling out, resulting in a functional setup on igb/igc, but a less sane one on other NICs. To the best of my understanding, taprio should prioritize based on the traffic class, so it should really dequeue starting with the highest traffic class and going down from there. We get to the TXQ using the tc_to_txq[] netdev property. TXQs within the same TC have the same (strict) priority, so we should pick from them as fairly as we can. We can achieve that by implementing something very similar to q->curband from multiq_dequeue(). Since igb/igc really do have TXQ 0 of higher hardware priority than TXQ 1 etc, we need to preserve the behavior for them as well. We really have no choice, because in txtime-assist mode, taprio is essentially a software scheduler towards offloaded child tc-etf qdiscs, so the TXQ selection really does matter (not all igb TXQs support ETF/SO_TXTIME, says Kurt Kanzenbach). To preserve the behavior, we need a capability bit so that taprio can determine if it's running on igb/igc, or on something else. Because igb doesn't offload taprio at all, we can't piggyback on the qdisc_offload_query_caps() call from taprio_enable_offload(), but instead we need a separate call which is also made for software scheduling. Introduce two static keys to minimize the performance penalty on systems which only have igb/igc NICs, and on systems which only have other NICs. For mixed systems, taprio will have to dynamically check whether to dequeue using one prioritization algorithm or using the other. 
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 4c22942 commit 2f530df

File tree

4 files changed

+143
-11
lines changed

4 files changed

+143
-11
lines changed

drivers/net/ethernet/intel/igb/igb_main.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2810,6 +2810,22 @@ static int igb_offload_txtime(struct igb_adapter *adapter,
28102810
return 0;
28112811
}
28122812

2813+
static int igb_tc_query_caps(struct igb_adapter *adapter,
2814+
struct tc_query_caps_base *base)
2815+
{
2816+
switch (base->type) {
2817+
case TC_SETUP_QDISC_TAPRIO: {
2818+
struct tc_taprio_caps *caps = base->caps;
2819+
2820+
caps->broken_mqprio = true;
2821+
2822+
return 0;
2823+
}
2824+
default:
2825+
return -EOPNOTSUPP;
2826+
}
2827+
}
2828+
28132829
static LIST_HEAD(igb_block_cb_list);
28142830

28152831
static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
@@ -2818,6 +2834,8 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
28182834
struct igb_adapter *adapter = netdev_priv(dev);
28192835

28202836
switch (type) {
2837+
case TC_QUERY_CAPS:
2838+
return igb_tc_query_caps(adapter, type_data);
28212839
case TC_SETUP_QDISC_CBS:
28222840
return igb_offload_cbs(adapter, type_data);
28232841
case TC_SETUP_BLOCK:

drivers/net/ethernet/intel/igc/igc_main.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6214,10 +6214,10 @@ static int igc_tc_query_caps(struct igc_adapter *adapter,
62146214
case TC_SETUP_QDISC_TAPRIO: {
62156215
struct tc_taprio_caps *caps = base->caps;
62166216

6217-
if (hw->mac.type != igc_i225)
6218-
return -EOPNOTSUPP;
6217+
caps->broken_mqprio = true;
62196218

6220-
caps->gate_mask_per_txq = true;
6219+
if (hw->mac.type == igc_i225)
6220+
caps->gate_mask_per_txq = true;
62216221

62226222
return 0;
62236223
}

include/net/pkt_sched.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,11 @@ struct tc_mqprio_qopt_offload {
177177
struct tc_taprio_caps {
178178
bool supports_queue_max_sdu:1;
179179
bool gate_mask_per_txq:1;
180+
/* Device expects lower TXQ numbers to have higher priority over higher
181+
* TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS,
182+
* INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE.
183+
*/
184+
bool broken_mqprio:1;
180185
};
181186

182187
struct tc_taprio_sched_entry {

net/sched/sch_taprio.c

Lines changed: 117 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
#include "sch_mqprio_lib.h"
3030

3131
static LIST_HEAD(taprio_list);
32+
static struct static_key_false taprio_have_broken_mqprio;
33+
static struct static_key_false taprio_have_working_mqprio;
3234

3335
#define TAPRIO_ALL_GATES_OPEN -1
3436

@@ -69,6 +71,8 @@ struct taprio_sched {
6971
enum tk_offsets tk_offset;
7072
int clockid;
7173
bool offloaded;
74+
bool detected_mqprio;
75+
bool broken_mqprio;
7276
atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
7377
* speeds it's sub-nanoseconds per byte
7478
*/
@@ -80,6 +84,7 @@ struct taprio_sched {
8084
struct sched_gate_list __rcu *admin_sched;
8185
struct hrtimer advance_timer;
8286
struct list_head taprio_list;
87+
int cur_txq[TC_MAX_QUEUE];
8388
u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
8489
u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */
8590
u32 txtime_delay;
@@ -568,17 +573,78 @@ static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
568573
return skb;
569574
}
570575

576+
static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
577+
{
578+
int offset = dev->tc_to_txq[tc].offset;
579+
int count = dev->tc_to_txq[tc].count;
580+
581+
(*txq)++;
582+
if (*txq == offset + count)
583+
*txq = offset;
584+
}
585+
586+
/* Prioritize higher traffic classes, and select among TXQs belonging to the
587+
* same TC using round robin
588+
*/
589+
static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
590+
struct sched_entry *entry,
591+
u32 gate_mask)
592+
{
593+
struct taprio_sched *q = qdisc_priv(sch);
594+
struct net_device *dev = qdisc_dev(sch);
595+
int num_tc = netdev_get_num_tc(dev);
596+
struct sk_buff *skb;
597+
int tc;
598+
599+
for (tc = num_tc - 1; tc >= 0; tc--) {
600+
int first_txq = q->cur_txq[tc];
601+
602+
if (!(gate_mask & BIT(tc)))
603+
continue;
604+
605+
do {
606+
skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
607+
entry, gate_mask);
608+
609+
taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
610+
611+
if (skb)
612+
return skb;
613+
} while (q->cur_txq[tc] != first_txq);
614+
}
615+
616+
return NULL;
617+
}
618+
619+
/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
620+
* class other than to determine whether the gate is open or not
621+
*/
622+
static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
623+
struct sched_entry *entry,
624+
u32 gate_mask)
625+
{
626+
struct net_device *dev = qdisc_dev(sch);
627+
struct sk_buff *skb;
628+
int i;
629+
630+
for (i = 0; i < dev->num_tx_queues; i++) {
631+
skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
632+
if (skb)
633+
return skb;
634+
}
635+
636+
return NULL;
637+
}
638+
571639
/* Will not be called in the full offload case, since the TX queues are
572640
* attached to the Qdisc created using qdisc_create_dflt()
573641
*/
574642
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
575643
{
576644
struct taprio_sched *q = qdisc_priv(sch);
577-
struct net_device *dev = qdisc_dev(sch);
578645
struct sk_buff *skb = NULL;
579646
struct sched_entry *entry;
580647
u32 gate_mask;
581-
int i;
582648

583649
rcu_read_lock();
584650
entry = rcu_dereference(q->current_entry);
@@ -588,14 +654,23 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
588654
* "AdminGateStates"
589655
*/
590656
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
591-
592657
if (!gate_mask)
593658
goto done;
594659

595-
for (i = 0; i < dev->num_tx_queues; i++) {
596-
skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
597-
if (skb)
598-
goto done;
660+
if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
661+
!static_branch_likely(&taprio_have_working_mqprio)) {
662+
/* Single NIC kind which is broken */
663+
skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
664+
} else if (static_branch_likely(&taprio_have_working_mqprio) &&
665+
!static_branch_unlikely(&taprio_have_broken_mqprio)) {
666+
/* Single NIC kind which prioritizes properly */
667+
skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
668+
} else {
669+
/* Mixed NIC kinds present in system, need dynamic testing */
670+
if (q->broken_mqprio)
671+
skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
672+
else
673+
skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
599674
}
600675

601676
done:
@@ -1157,6 +1232,34 @@ static void taprio_sched_to_offload(struct net_device *dev,
11571232
offload->num_entries = i;
11581233
}
11591234

1235+
static void taprio_detect_broken_mqprio(struct taprio_sched *q)
1236+
{
1237+
struct net_device *dev = qdisc_dev(q->root);
1238+
struct tc_taprio_caps caps;
1239+
1240+
qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
1241+
&caps, sizeof(caps));
1242+
1243+
q->broken_mqprio = caps.broken_mqprio;
1244+
if (q->broken_mqprio)
1245+
static_branch_inc(&taprio_have_broken_mqprio);
1246+
else
1247+
static_branch_inc(&taprio_have_working_mqprio);
1248+
1249+
q->detected_mqprio = true;
1250+
}
1251+
1252+
static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
1253+
{
1254+
if (!q->detected_mqprio)
1255+
return;
1256+
1257+
if (q->broken_mqprio)
1258+
static_branch_dec(&taprio_have_broken_mqprio);
1259+
else
1260+
static_branch_dec(&taprio_have_working_mqprio);
1261+
}
1262+
11601263
static int taprio_enable_offload(struct net_device *dev,
11611264
struct taprio_sched *q,
11621265
struct sched_gate_list *sched,
@@ -1538,10 +1641,12 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
15381641
err = netdev_set_num_tc(dev, mqprio->num_tc);
15391642
if (err)
15401643
goto free_sched;
1541-
for (i = 0; i < mqprio->num_tc; i++)
1644+
for (i = 0; i < mqprio->num_tc; i++) {
15421645
netdev_set_tc_queue(dev, i,
15431646
mqprio->count[i],
15441647
mqprio->offset[i]);
1648+
q->cur_txq[i] = mqprio->offset[i];
1649+
}
15451650

15461651
/* Always use supplied priority mappings */
15471652
for (i = 0; i <= TC_BITMASK; i++)
@@ -1676,6 +1781,8 @@ static void taprio_destroy(struct Qdisc *sch)
16761781

16771782
if (admin)
16781783
call_rcu(&admin->rcu, taprio_free_sched_cb);
1784+
1785+
taprio_cleanup_broken_mqprio(q);
16791786
}
16801787

16811788
static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1740,6 +1847,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
17401847
q->qdiscs[i] = qdisc;
17411848
}
17421849

1850+
taprio_detect_broken_mqprio(q);
1851+
17431852
return taprio_change(sch, opt, extack);
17441853
}
17451854

0 commit comments

Comments
 (0)