
Commit 989d957

Merge branch 'TC-Introduce-qevents'
Petr Machata says:

====================
TC: Introduce qevents

The Spectrum hardware allows execution of one of several actions as a result of queue management decisions: tail-dropping, early-dropping, marking a packet, or passing a configured latency threshold or buffer size. Such packets can be mirrored, trapped, or sampled.

Modeling the action to be taken as simply a TC action is very attractive, but it is not obvious where to put these actions. At least with ECN marking one could imagine a tree of qdiscs and classifiers that effectively accomplishes this task, albeit in an impractically complex manner. But there is just no way to match on the dropped-ness of a packet, let alone dropped-ness due to a particular reason.

To allow configuring user-defined actions as a result of the inner workings of a qdisc, this patch set introduces the concept of qevents. These are attach points for TC blocks, where filters can be put that are executed as a packet hits well-defined points in the qdisc algorithms. The attached blocks can be shared, in a manner similar to clsact ingress and egress blocks; arbitrary classifiers with arbitrary actions can be put on them; etc. For example:

	red limit 500K avpkt 1K qevent early_drop block 10
	matchall action mirred egress mirror dev eth1

The central patch #2 introduces several helpers to allow easy and uniform addition of qevents to qdiscs: initialization, destruction, qevent block number change validation, and qevent handling, i.e. dispatch of the filters attached to the block bound to a qevent.

Patch #1 adds a root_lock argument to the qdisc enqueue op. The problem this tackles is that if a qevent filter pushes packets to the same qdisc tree that holds the qevent in the first place, an attempt to take the qdisc root lock a second time will lead to a deadlock. To solve the issue, the qevent handler needs to unlock and relock the root lock around the filter processing. Passing root_lock around makes it possible to take the lock where it is needed, and visibly so, such that it is obvious the lock will be used when invoking a qevent.

The following two patches, #3 and #4, then add two qevents to the RED qdisc: the "early_drop" qevent fires when a packet is early-dropped; the "mark" qevent, when it is ECN-marked.

Patch #5 contains a selftest. I mentioned this test when pushing the RED ECN nodrop mode and said that "I have no confidence in its portability to [...] different configurations". That still holds. The backlog and packet size are tuned to make the test deterministic. But it is better than nothing, and on the boxes that I ran it on it does work and shows that qevents work the way they are supposed to, and that their addition has not broken the other tested features.

This patch set does not deal with offloading. The idea there is that a driver will be able to figure out that a given block is used in a qevent context by looking at the binder type. A future patch set will add a qdisc pointer to struct flow_block_offload, which a driver will be able to consult to glean the TC or other relevant attributes.

Changes from RFC to v1:

- Move a "q = qdisc_priv(sch)" from patch #3 to patch #4.
- Fix a deadlock caused by mirroring a packet back to the same qdisc tree.
- Rename the "tail" qevent to "tail_drop".
- Adapt to the new 100-column standard.
- Add a selftest.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents: 5e701e4 + 6cf0291

40 files changed: +822, -84 lines
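Only 10 of the 40 changed files are reproduced in the excerpt below; the sch_red changes from patches #3 and #4 — the first consumer of the qevent helpers — are among those omitted. As a rough illustration of how a qdisc is expected to wire a qevent through the helpers declared in include/net/pkt_cls.h and implemented in net/sched/cls_api.c below, here is a minimal sketch. All "sketch_" names, the private-data field, and the should_early_drop() predicate are hypothetical; this is not the actual sch_red code.

	/* Sketch only: a qdisc binding one qevent block and dispatching it
	 * on its early-drop path. Hypothetical names throughout. */
	struct sketch_sched_data {
	        struct tcf_qevent qe_early_drop;    /* hypothetical field */
	};

	static int sketch_init(struct Qdisc *sch, struct nlattr **tb,
	                       struct netlink_ext_ack *extack)
	{
	        struct sketch_sched_data *q = qdisc_priv(sch);

	        /* Bind the TC block whose index arrived as a u32 attribute. */
	        return tcf_qevent_init(&q->qe_early_drop, sch,
	                               FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP,
	                               tb[TCA_RED_EARLY_DROP_BLOCK], extack);
	}

	static int sketch_enqueue(struct sk_buff *skb, struct Qdisc *sch,
	                          spinlock_t *root_lock, struct sk_buff **to_free)
	{
	        struct sketch_sched_data *q = qdisc_priv(sch);
	        int ret;

	        if (should_early_drop(skb)) {       /* hypothetical predicate */
	                /* Dispatch the filters bound to the qevent block. The
	                 * helper drops root_lock around classification; a NULL
	                 * return means the filters consumed the packet and
	                 * *ret holds the __NET_XMIT_* disposition. */
	                skb = tcf_qevent_handle(&q->qe_early_drop, sch, skb,
	                                        root_lock, to_free, &ret);
	                if (!skb)
	                        return NET_XMIT_CN | ret;

	                qdisc_drop(skb, sch, to_free);
	                return NET_XMIT_CN;
	        }

	        return qdisc_enqueue_tail(skb, sch);
	}

	static void sketch_destroy(struct Qdisc *sch)
	{
	        struct sketch_sched_data *q = qdisc_priv(sch);

	        /* A no-op if no block was ever bound. */
	        tcf_qevent_destroy(&q->qe_early_drop, sch);
	}

Note how the enqueue op carries the new spinlock_t *root_lock parameter from patch #1, which tcf_qevent_handle() uses to avoid the re-entrancy deadlock described in the cover letter.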

include/net/flow_offload.h

Lines changed: 2 additions & 0 deletions
@@ -424,6 +424,8 @@ enum flow_block_binder_type {
 	FLOW_BLOCK_BINDER_TYPE_UNSPEC,
 	FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
 	FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS,
+	FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP,
+	FLOW_BLOCK_BINDER_TYPE_RED_MARK,
 };
 
 struct flow_block {

include/net/pkt_cls.h

Lines changed: 49 additions & 0 deletions
@@ -32,6 +32,12 @@ struct tcf_block_ext_info {
 	u32 block_index;
 };
 
+struct tcf_qevent {
+	struct tcf_block	*block;
+	struct tcf_block_ext_info info;
+	struct tcf_proto __rcu *filter_chain;
+};
+
 struct tcf_block_cb;
 bool tcf_queue_work(struct rcu_work *rwork, work_func_t func);
 
@@ -553,6 +559,49 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
 			  void *cb_priv, u32 *flags, unsigned int *in_hw_count);
 unsigned int tcf_exts_num_actions(struct tcf_exts *exts);
 
+#ifdef CONFIG_NET_CLS_ACT
+int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+		    enum flow_block_binder_type binder_type,
+		    struct nlattr *block_index_attr,
+		    struct netlink_ext_ack *extack);
+void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch);
+int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+			       struct netlink_ext_ack *extack);
+struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+				  spinlock_t *root_lock, struct sk_buff **to_free, int *ret);
+int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe);
+#else
+static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+				  enum flow_block_binder_type binder_type,
+				  struct nlattr *block_index_attr,
+				  struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch)
+{
+}
+
+static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+					     struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+static inline struct sk_buff *
+tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+		  spinlock_t *root_lock, struct sk_buff **to_free, int *ret)
+{
+	return skb;
+}
+
+static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe)
+{
+	return 0;
+}
+#endif
+
 struct tc_cls_u32_knode {
 	struct tcf_exts *exts;
 	struct tcf_result *res;

include/net/sch_generic.h

Lines changed: 4 additions & 2 deletions
@@ -57,6 +57,7 @@ struct qdisc_skb_head {
 struct Qdisc {
 	int			(*enqueue)(struct sk_buff *skb,
 					   struct Qdisc *sch,
+					   spinlock_t *root_lock,
 					   struct sk_buff **to_free);
 	struct sk_buff *	(*dequeue)(struct Qdisc *sch);
 	unsigned int		flags;
@@ -241,6 +242,7 @@ struct Qdisc_ops {
 
 	int			(*enqueue)(struct sk_buff *skb,
 					   struct Qdisc *sch,
+					   spinlock_t *root_lock,
 					   struct sk_buff **to_free);
 	struct sk_buff *	(*dequeue)(struct Qdisc *);
 	struct sk_buff *	(*peek)(struct Qdisc *);
@@ -788,11 +790,11 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
 #endif
 }
 
-static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
 				struct sk_buff **to_free)
 {
 	qdisc_calculate_pkt_len(skb, sch);
-	return sch->enqueue(skb, sch, to_free);
+	return sch->enqueue(skb, sch, root_lock, to_free);
 }
 
 static inline void _bstats_update(struct gnet_stats_basic_packed *bstats,

include/uapi/linux/pkt_sched.h

Lines changed: 2 additions & 0 deletions
@@ -257,6 +257,8 @@ enum {
 	TCA_RED_STAB,
 	TCA_RED_MAX_P,
 	TCA_RED_FLAGS,		/* bitfield32 */
+	TCA_RED_EARLY_DROP_BLOCK, /* u32 */
+	TCA_RED_MARK_BLOCK,	/* u32 */
 	__TCA_RED_MAX,
 };
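Both new attributes carry a u32 block index (a zero index is rejected by tcf_qevent_parse_block_index() further below). A qdisc's netlink policy would admit them roughly as follows; this is a sketch with the existing TCA_RED_* entries elided, since the actual red_policy update lives in the sch_red patch this excerpt omits.

	/* Sketch only: policy entries for the new u32 block-index attributes. */
	static const struct nla_policy sketch_red_policy[TCA_RED_MAX + 1] = {
	        /* ... existing TCA_RED_* entries ... */
	        [TCA_RED_EARLY_DROP_BLOCK] = { .type = NLA_U32 },
	        [TCA_RED_MARK_BLOCK]       = { .type = NLA_U32 },
	};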

net/core/dev.c

Lines changed: 2 additions & 2 deletions
@@ -3749,7 +3749,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	qdisc_calculate_pkt_len(skb, q);
 
 	if (q->flags & TCQ_F_NOLOCK) {
-		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+		rc = q->enqueue(skb, q, NULL, &to_free) & NET_XMIT_MASK;
 		qdisc_run(q);
 
 		if (unlikely(to_free))
@@ -3792,7 +3792,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+		rc = q->enqueue(skb, q, root_lock, &to_free) & NET_XMIT_MASK;
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);

net/sched/cls_api.c

Lines changed: 119 additions & 0 deletions
@@ -3748,6 +3748,125 @@ unsigned int tcf_exts_num_actions(struct tcf_exts *exts)
 }
 EXPORT_SYMBOL(tcf_exts_num_actions);
 
+#ifdef CONFIG_NET_CLS_ACT
+static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr,
+					u32 *p_block_index,
+					struct netlink_ext_ack *extack)
+{
+	*p_block_index = nla_get_u32(block_index_attr);
+	if (!*p_block_index) {
+		NL_SET_ERR_MSG(extack, "Block number may not be zero");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch,
+		    enum flow_block_binder_type binder_type,
+		    struct nlattr *block_index_attr,
+		    struct netlink_ext_ack *extack)
+{
+	u32 block_index;
+	int err;
+
+	if (!block_index_attr)
+		return 0;
+
+	err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+	if (err)
+		return err;
+
+	if (!block_index)
+		return 0;
+
+	qe->info.binder_type = binder_type;
+	qe->info.chain_head_change = tcf_chain_head_change_dflt;
+	qe->info.chain_head_change_priv = &qe->filter_chain;
+	qe->info.block_index = block_index;
+
+	return tcf_block_get_ext(&qe->block, sch, &qe->info, extack);
+}
+EXPORT_SYMBOL(tcf_qevent_init);
+
+void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch)
+{
+	if (qe->info.block_index)
+		tcf_block_put_ext(qe->block, sch, &qe->info);
+}
+EXPORT_SYMBOL(tcf_qevent_destroy);
+
+int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr,
+			       struct netlink_ext_ack *extack)
+{
+	u32 block_index;
+	int err;
+
+	if (!block_index_attr)
+		return 0;
+
+	err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack);
+	if (err)
+		return err;
+
+	/* Bounce newly-configured block or change in block. */
+	if (block_index != qe->info.block_index) {
+		NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(tcf_qevent_validate_change);
+
+struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb,
+				  spinlock_t *root_lock, struct sk_buff **to_free, int *ret)
+{
+	struct tcf_result cl_res;
+	struct tcf_proto *fl;
+
+	if (!qe->info.block_index)
+		return skb;
+
+	fl = rcu_dereference_bh(qe->filter_chain);
+
+	if (root_lock)
+		spin_unlock(root_lock);
+
+	switch (tcf_classify(skb, fl, &cl_res, false)) {
+	case TC_ACT_SHOT:
+		qdisc_qstats_drop(sch);
+		__qdisc_drop(skb, to_free);
+		*ret = __NET_XMIT_BYPASS;
+		return NULL;
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+	case TC_ACT_TRAP:
+		__qdisc_drop(skb, to_free);
+		*ret = __NET_XMIT_STOLEN;
+		return NULL;
+	case TC_ACT_REDIRECT:
+		skb_do_redirect(skb);
+		*ret = __NET_XMIT_STOLEN;
+		return NULL;
+	}
+
+	if (root_lock)
+		spin_lock(root_lock);
+
+	return skb;
+}
+EXPORT_SYMBOL(tcf_qevent_handle);
+
+int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe)
+{
+	if (!qe->info.block_index)
+		return 0;
+	return nla_put_u32(skb, attr_name, qe->info.block_index);
+}
+EXPORT_SYMBOL(tcf_qevent_dump);
+#endif
+
 static __net_init int tcf_net_init(struct net *net)
 {
 	struct tcf_net *tn = net_generic(net, tcf_net_id);
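The two remaining helpers, tcf_qevent_validate_change() and tcf_qevent_dump(), are meant to be called from a qdisc's ->change() and ->dump() ops. A minimal sketch, reusing the hypothetical sketch_sched_data from the sketch near the top of this excerpt (again, not the actual sch_red code):

	static int sketch_change(struct Qdisc *sch, struct nlattr **tb,
	                         struct netlink_ext_ack *extack)
	{
	        struct sketch_sched_data *q = qdisc_priv(sch);

	        /* Rebinding a different block at runtime is bounced with
	         * -EINVAL; omitting the attribute, or repeating the index
	         * already bound, passes. */
	        return tcf_qevent_validate_change(&q->qe_early_drop,
	                                          tb[TCA_RED_EARLY_DROP_BLOCK],
	                                          extack);
	}

	static int sketch_dump(struct Qdisc *sch, struct sk_buff *skb)
	{
	        struct sketch_sched_data *q = qdisc_priv(sch);

	        /* Emits the block-index attribute only when a block is bound. */
	        return tcf_qevent_dump(skb, TCA_RED_EARLY_DROP_BLOCK,
	                               &q->qe_early_drop);
	}

Note the unlock/relock of root_lock in tcf_qevent_handle() above: it allows an attached action such as mirred to push a packet back into the same qdisc tree without deadlocking on the root lock, which is the scenario patch #1's root_lock plumbing exists to handle.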

net/sched/sch_atm.c

Lines changed: 2 additions & 2 deletions
@@ -374,7 +374,7 @@ static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl,
 
 /* --------------------------- Qdisc operations ---------------------------- */
 
-static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
 			  struct sk_buff **to_free)
 {
 	struct atm_qdisc_data *p = qdisc_priv(sch);
@@ -432,7 +432,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 #endif
 	}
 
-	ret = qdisc_enqueue(skb, flow->q, to_free);
+	ret = qdisc_enqueue(skb, flow->q, root_lock, to_free);
 	if (ret != NET_XMIT_SUCCESS) {
 drop: __maybe_unused
 		if (net_xmit_drop_count(ret)) {

net/sched/sch_blackhole.c

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 #include <linux/skbuff.h>
 #include <net/pkt_sched.h>
 
-static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
 			     struct sk_buff **to_free)
 {
 	qdisc_drop(skb, sch, to_free);

net/sched/sch_cake.c

Lines changed: 1 addition & 1 deletion
@@ -1687,7 +1687,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
 
 static void cake_reconfigure(struct Qdisc *sch);
 
-static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
 			struct sk_buff **to_free)
 {
 	struct cake_sched_data *q = qdisc_priv(sch);

net/sched/sch_cbq.c

Lines changed: 2 additions & 2 deletions
@@ -356,7 +356,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 }
 
 static int
-cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, spinlock_t *root_lock,
 	    struct sk_buff **to_free)
 {
 	struct cbq_sched_data *q = qdisc_priv(sch);
@@ -373,7 +373,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		return ret;
 	}
 
-	ret = qdisc_enqueue(skb, cl->q, to_free);
+	ret = qdisc_enqueue(skb, cl->q, root_lock, to_free);
 	if (ret == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
 		cbq_mark_toplevel(q, cl);
