@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
187187
188188static void packet_flush_mclist (struct sock * sk );
189189
190+ struct packet_fanout ;
190191struct packet_sock {
191192 /* struct sock has to be the first member of packet_sock */
192193 struct sock sk ;
194+ struct packet_fanout * fanout ;
193195 struct tpacket_stats stats ;
194196 struct packet_ring_buffer rx_ring ;
195197 struct packet_ring_buffer tx_ring ;
@@ -212,6 +214,24 @@ struct packet_sock {
212214 struct packet_type prot_hook ____cacheline_aligned_in_smp ;
213215};
214216
217+ #define PACKET_FANOUT_MAX 256
218+
219+ struct packet_fanout {
220+ #ifdef CONFIG_NET_NS
221+ struct net * net ;
222+ #endif
223+ unsigned int num_members ;
224+ u16 id ;
225+ u8 type ;
226+ u8 pad ;
227+ atomic_t rr_cur ;
228+ struct list_head list ;
229+ struct sock * arr [PACKET_FANOUT_MAX ];
230+ spinlock_t lock ;
231+ atomic_t sk_ref ;
232+ struct packet_type prot_hook ____cacheline_aligned_in_smp ;
233+ };
234+
215235struct packet_skb_cb {
216236 unsigned int origlen ;
217237 union {
@@ -227,6 +247,9 @@ static inline struct packet_sock *pkt_sk(struct sock *sk)
227247 return (struct packet_sock * )sk ;
228248}
229249
250+ static void __fanout_unlink (struct sock * sk , struct packet_sock * po );
251+ static void __fanout_link (struct sock * sk , struct packet_sock * po );
252+
230253/* register_prot_hook must be invoked with the po->bind_lock held,
231254 * or from a context in which asynchronous accesses to the packet
232255 * socket is not possible (packet_create()).
@@ -235,7 +258,10 @@ static void register_prot_hook(struct sock *sk)
235258{
236259 struct packet_sock * po = pkt_sk (sk );
237260 if (!po -> running ) {
238- dev_add_pack (& po -> prot_hook );
261+ if (po -> fanout )
262+ __fanout_link (sk , po );
263+ else
264+ dev_add_pack (& po -> prot_hook );
239265 sock_hold (sk );
240266 po -> running = 1 ;
241267 }
@@ -253,7 +279,10 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
253279 struct packet_sock * po = pkt_sk (sk );
254280
255281 po -> running = 0 ;
256- __dev_remove_pack (& po -> prot_hook );
282+ if (po -> fanout )
283+ __fanout_unlink (sk , po );
284+ else
285+ __dev_remove_pack (& po -> prot_hook );
257286 __sock_put (sk );
258287
259288 if (sync ) {
@@ -388,6 +417,201 @@ static void packet_sock_destruct(struct sock *sk)
388417 sk_refcnt_debug_dec (sk );
389418}
390419
420+ static int fanout_rr_next (struct packet_fanout * f , unsigned int num )
421+ {
422+ int x = atomic_read (& f -> rr_cur ) + 1 ;
423+
424+ if (x >= num )
425+ x = 0 ;
426+
427+ return x ;
428+ }
429+
430+ static struct sock * fanout_demux_hash (struct packet_fanout * f , struct sk_buff * skb , unsigned int num )
431+ {
432+ u32 idx , hash = skb -> rxhash ;
433+
434+ idx = ((u64 )hash * num ) >> 32 ;
435+
436+ return f -> arr [idx ];
437+ }
438+
439+ static struct sock * fanout_demux_lb (struct packet_fanout * f , struct sk_buff * skb , unsigned int num )
440+ {
441+ int cur , old ;
442+
443+ cur = atomic_read (& f -> rr_cur );
444+ while ((old = atomic_cmpxchg (& f -> rr_cur , cur ,
445+ fanout_rr_next (f , num ))) != cur )
446+ cur = old ;
447+ return f -> arr [cur ];
448+ }
449+
450+ static int packet_rcv_fanout_hash (struct sk_buff * skb , struct net_device * dev ,
451+ struct packet_type * pt , struct net_device * orig_dev )
452+ {
453+ struct packet_fanout * f = pt -> af_packet_priv ;
454+ unsigned int num = f -> num_members ;
455+ struct packet_sock * po ;
456+ struct sock * sk ;
457+
458+ if (!net_eq (dev_net (dev ), read_pnet (& f -> net )) ||
459+ !num ) {
460+ kfree_skb (skb );
461+ return 0 ;
462+ }
463+
464+ skb_get_rxhash (skb );
465+
466+ sk = fanout_demux_hash (f , skb , num );
467+ po = pkt_sk (sk );
468+
469+ return po -> prot_hook .func (skb , dev , & po -> prot_hook , orig_dev );
470+ }
471+
472+ static int packet_rcv_fanout_lb (struct sk_buff * skb , struct net_device * dev ,
473+ struct packet_type * pt , struct net_device * orig_dev )
474+ {
475+ struct packet_fanout * f = pt -> af_packet_priv ;
476+ unsigned int num = f -> num_members ;
477+ struct packet_sock * po ;
478+ struct sock * sk ;
479+
480+ if (!net_eq (dev_net (dev ), read_pnet (& f -> net )) ||
481+ !num ) {
482+ kfree_skb (skb );
483+ return 0 ;
484+ }
485+
486+ sk = fanout_demux_lb (f , skb , num );
487+ po = pkt_sk (sk );
488+
489+ return po -> prot_hook .func (skb , dev , & po -> prot_hook , orig_dev );
490+ }
491+
492+ static DEFINE_MUTEX (fanout_mutex );
493+ static LIST_HEAD (fanout_list );
494+
495+ static void __fanout_link (struct sock * sk , struct packet_sock * po )
496+ {
497+ struct packet_fanout * f = po -> fanout ;
498+
499+ spin_lock (& f -> lock );
500+ f -> arr [f -> num_members ] = sk ;
501+ smp_wmb ();
502+ f -> num_members ++ ;
503+ spin_unlock (& f -> lock );
504+ }
505+
506+ static void __fanout_unlink (struct sock * sk , struct packet_sock * po )
507+ {
508+ struct packet_fanout * f = po -> fanout ;
509+ int i ;
510+
511+ spin_lock (& f -> lock );
512+ for (i = 0 ; i < f -> num_members ; i ++ ) {
513+ if (f -> arr [i ] == sk )
514+ break ;
515+ }
516+ BUG_ON (i >= f -> num_members );
517+ f -> arr [i ] = f -> arr [f -> num_members - 1 ];
518+ f -> num_members -- ;
519+ spin_unlock (& f -> lock );
520+ }
521+
522+ static int fanout_add (struct sock * sk , u16 id , u8 type )
523+ {
524+ struct packet_sock * po = pkt_sk (sk );
525+ struct packet_fanout * f , * match ;
526+ int err ;
527+
528+ switch (type ) {
529+ case PACKET_FANOUT_HASH :
530+ case PACKET_FANOUT_LB :
531+ break ;
532+ default :
533+ return - EINVAL ;
534+ }
535+
536+ if (!po -> running )
537+ return - EINVAL ;
538+
539+ if (po -> fanout )
540+ return - EALREADY ;
541+
542+ mutex_lock (& fanout_mutex );
543+ match = NULL ;
544+ list_for_each_entry (f , & fanout_list , list ) {
545+ if (f -> id == id &&
546+ read_pnet (& f -> net ) == sock_net (sk )) {
547+ match = f ;
548+ break ;
549+ }
550+ }
551+ if (!match ) {
552+ match = kzalloc (sizeof (* match ), GFP_KERNEL );
553+ if (match ) {
554+ write_pnet (& match -> net , sock_net (sk ));
555+ match -> id = id ;
556+ match -> type = type ;
557+ atomic_set (& match -> rr_cur , 0 );
558+ INIT_LIST_HEAD (& match -> list );
559+ spin_lock_init (& match -> lock );
560+ atomic_set (& match -> sk_ref , 0 );
561+ match -> prot_hook .type = po -> prot_hook .type ;
562+ match -> prot_hook .dev = po -> prot_hook .dev ;
563+ switch (type ) {
564+ case PACKET_FANOUT_HASH :
565+ match -> prot_hook .func = packet_rcv_fanout_hash ;
566+ break ;
567+ case PACKET_FANOUT_LB :
568+ match -> prot_hook .func = packet_rcv_fanout_lb ;
569+ break ;
570+ }
571+ match -> prot_hook .af_packet_priv = match ;
572+ dev_add_pack (& match -> prot_hook );
573+ list_add (& match -> list , & fanout_list );
574+ }
575+ }
576+ err = - ENOMEM ;
577+ if (match ) {
578+ err = - EINVAL ;
579+ if (match -> type == type &&
580+ match -> prot_hook .type == po -> prot_hook .type &&
581+ match -> prot_hook .dev == po -> prot_hook .dev ) {
582+ err = - ENOSPC ;
583+ if (atomic_read (& match -> sk_ref ) < PACKET_FANOUT_MAX ) {
584+ __dev_remove_pack (& po -> prot_hook );
585+ po -> fanout = match ;
586+ atomic_inc (& match -> sk_ref );
587+ __fanout_link (sk , po );
588+ err = 0 ;
589+ }
590+ }
591+ }
592+ mutex_unlock (& fanout_mutex );
593+ return err ;
594+ }
595+
596+ static void fanout_release (struct sock * sk )
597+ {
598+ struct packet_sock * po = pkt_sk (sk );
599+ struct packet_fanout * f ;
600+
601+ f = po -> fanout ;
602+ if (!f )
603+ return ;
604+
605+ po -> fanout = NULL ;
606+
607+ mutex_lock (& fanout_mutex );
608+ if (atomic_dec_and_test (& f -> sk_ref )) {
609+ list_del (& f -> list );
610+ dev_remove_pack (& f -> prot_hook );
611+ kfree (f );
612+ }
613+ mutex_unlock (& fanout_mutex );
614+ }
391615
392616static const struct proto_ops packet_ops ;
393617
@@ -1398,6 +1622,8 @@ static int packet_release(struct socket *sock)
13981622 if (po -> tx_ring .pg_vec )
13991623 packet_set_ring (sk , & req , 1 , 1 );
14001624
1625+ fanout_release (sk );
1626+
14011627 synchronize_net ();
14021628 /*
14031629 * Now the socket is dead. No more input will appear.
@@ -1421,9 +1647,9 @@ static int packet_release(struct socket *sock)
14211647static int packet_do_bind (struct sock * sk , struct net_device * dev , __be16 protocol )
14221648{
14231649 struct packet_sock * po = pkt_sk (sk );
1424- /*
1425- * Detach an existing hook if present.
1426- */
1650+
1651+ if ( po -> fanout )
1652+ return - EINVAL ;
14271653
14281654 lock_sock (sk );
14291655
@@ -2133,6 +2359,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
21332359 po -> tp_tstamp = val ;
21342360 return 0 ;
21352361 }
2362+ case PACKET_FANOUT :
2363+ {
2364+ int val ;
2365+
2366+ if (optlen != sizeof (val ))
2367+ return - EINVAL ;
2368+ if (copy_from_user (& val , optval , sizeof (val )))
2369+ return - EFAULT ;
2370+
2371+ return fanout_add (sk , val & 0xffff , val >> 16 );
2372+ }
21362373 default :
21372374 return - ENOPROTOOPT ;
21382375 }
@@ -2231,6 +2468,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
22312468 val = po -> tp_tstamp ;
22322469 data = & val ;
22332470 break ;
2471+ case PACKET_FANOUT :
2472+ if (len > sizeof (int ))
2473+ len = sizeof (int );
2474+ val = (po -> fanout ?
2475+ ((u32 )po -> fanout -> id |
2476+ ((u32 )po -> fanout -> type << 16 )) :
2477+ 0 );
2478+ data = & val ;
2479+ break ;
22342480 default :
22352481 return - ENOPROTOOPT ;
22362482 }
0 commit comments