@@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
         return !nft_rbtree_interval_end(rbe);
 }
 
-static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
-                             const struct nft_rbtree_elem *interval)
+static int nft_rbtree_cmp(const struct nft_set *set,
+                          const struct nft_rbtree_elem *e1,
+                          const struct nft_rbtree_elem *e2)
 {
-        return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+        return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
+                      set->klen);
 }
 
 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
@@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
         const struct nft_rbtree_elem *rbe, *interval = NULL;
         u8 genmask = nft_genmask_cur(net);
         const struct rb_node *parent;
-        const void *this;
         int d;
 
         parent = rcu_dereference_raw(priv->root.rb_node);
@@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 
                 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
 
-                this = nft_set_ext_key(&rbe->ext);
-                d = memcmp(this, key, set->klen);
+                d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
                 if (d < 0) {
                         parent = rcu_dereference_raw(parent->rb_left);
                         if (interval &&
-                            nft_rbtree_equal(set, this, interval) &&
+                            !nft_rbtree_cmp(set, rbe, interval) &&
                             nft_rbtree_interval_end(rbe) &&
                             nft_rbtree_interval_start(interval))
                                 continue;
@@ -215,154 +215,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
         return rbe;
 }
 
+static int nft_rbtree_gc_elem(const struct nft_set *__set,
+                              struct nft_rbtree *priv,
+                              struct nft_rbtree_elem *rbe)
+{
+        struct nft_set *set = (struct nft_set *)__set;
+        struct rb_node *prev = rb_prev(&rbe->node);
+        struct nft_rbtree_elem *rbe_prev;
+        struct nft_set_gc_batch *gcb;
+
+        gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
+        if (!gcb)
+                return -ENOMEM;
+
+        /* search for expired end interval coming before this element. */
+        do {
+                rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+                if (nft_rbtree_interval_end(rbe_prev))
+                        break;
+
+                prev = rb_prev(prev);
+        } while (prev != NULL);
+
+        rb_erase(&rbe_prev->node, &priv->root);
+        rb_erase(&rbe->node, &priv->root);
+        atomic_sub(2, &set->nelems);
+
+        nft_set_gc_batch_add(gcb, rbe);
+        nft_set_gc_batch_complete(gcb);
+
+        return 0;
+}
+
+static bool nft_rbtree_update_first(const struct nft_set *set,
+                                    struct nft_rbtree_elem *rbe,
+                                    struct rb_node *first)
+{
+        struct nft_rbtree_elem *first_elem;
+
+        first_elem = rb_entry(first, struct nft_rbtree_elem, node);
+        /* this element is closest to where the new element is to be inserted:
+         * update the first element for the node list path.
+         */
+        if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
+                return true;
+
+        return false;
+}
+
 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
                                struct nft_rbtree_elem *new,
                                struct nft_set_ext **ext)
 {
-        bool overlap = false, dup_end_left = false, dup_end_right = false;
+        struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+        struct rb_node *node, *parent, **p, *first = NULL;
         struct nft_rbtree *priv = nft_set_priv(set);
         u8 genmask = nft_genmask_next(net);
-        struct nft_rbtree_elem *rbe;
-        struct rb_node *parent, **p;
-        int d;
+        int d, err;
 
-        /* Detect overlaps as we descend the tree. Set the flag in these cases:
-         *
-         * a1. _ _ __>|  ?_ _ __|  (insert end before existing end)
-         * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end)
-         * a3. _ _ ___? >|_ _ __|  (insert start before existing end)
-         *
-         * and clear it later on, as we eventually reach the points indicated by
-         * '?' above, in the cases described below. We'll always meet these
-         * later, locally, due to tree ordering, and overlaps for the intervals
-         * that are the closest together are always evaluated last.
-         *
-         * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
-         * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
-         * b3. _ _ ___! >|_ _ __|  (insert start after existing end, as a leaf)
-         *            '--' no nodes falling in this range
-         * b4.          >|_ _   !  (insert start before existing start)
-         *
-         * Case a3. resolves to b3.:
-         * - if the inserted start element is the leftmost, because the '0'
-         *   element in the tree serves as end element
-         * - otherwise, if an existing end is found immediately to the left. If
-         *   there are existing nodes in between, we need to further descend the
-         *   tree before we can conclude the new start isn't causing an overlap
-         *
-         * or to b4., which, preceded by a3., means we already traversed one or
-         * more existing intervals entirely, from the right.
-         *
-         * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
-         * in that order.
-         *
-         * The flag is also cleared in two special cases:
-         *
-         * b5. |__ _ _!|<_ _ _   (insert start right before existing end)
-         * b6. |__ _ >|!__ _ _   (insert end right after existing start)
-         *
-         * which always happen as last step and imply that no further
-         * overlapping is possible.
-         *
-         * Another special case comes from the fact that start elements matching
-         * an already existing start element are allowed: insertion is not
-         * performed but we return -EEXIST in that case, and the error will be
-         * cleared by the caller if NLM_F_EXCL is not present in the request.
-         * This way, request for insertion of an exact overlap isn't reported as
-         * error to userspace if not desired.
-         *
-         * However, if the existing start matches a pre-existing start, but the
-         * end element doesn't match the corresponding pre-existing end element,
-         * we need to report a partial overlap. This is a local condition that
-         * can be noticed without need for a tracking flag, by checking for a
-         * local duplicated end for a corresponding start, from left and right,
-         * separately.
+        /* Descend the tree to search for an existing element greater than the
+         * key value to insert that is greater than the new element. This is the
+         * first element to walk the ordered elements to find possible overlap.
          */
-
         parent = NULL;
         p = &priv->root.rb_node;
         while (*p != NULL) {
                 parent = *p;
                 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-                d = memcmp(nft_set_ext_key(&rbe->ext),
-                           nft_set_ext_key(&new->ext),
-                           set->klen);
+                d = nft_rbtree_cmp(set, rbe, new);
+
                 if (d < 0) {
                         p = &parent->rb_left;
-
-                        if (nft_rbtree_interval_start(new)) {
-                                if (nft_rbtree_interval_end(rbe) &&
-                                    nft_set_elem_active(&rbe->ext, genmask) &&
-                                    !nft_set_elem_expired(&rbe->ext) && !*p)
-                                        overlap = false;
-                        } else {
-                                if (dup_end_left && !*p)
-                                        return -ENOTEMPTY;
-
-                                overlap = nft_rbtree_interval_end(rbe) &&
-                                          nft_set_elem_active(&rbe->ext,
-                                                              genmask) &&
-                                          !nft_set_elem_expired(&rbe->ext);
-
-                                if (overlap) {
-                                        dup_end_right = true;
-                                        continue;
-                                }
-                        }
                 } else if (d > 0) {
-                        p = &parent->rb_right;
+                        if (!first ||
+                            nft_rbtree_update_first(set, rbe, first))
+                                first = &rbe->node;
 
-                        if (nft_rbtree_interval_end(new)) {
-                                if (dup_end_right && !*p)
-                                        return -ENOTEMPTY;
-
-                                overlap = nft_rbtree_interval_end(rbe) &&
-                                          nft_set_elem_active(&rbe->ext,
-                                                              genmask) &&
-                                          !nft_set_elem_expired(&rbe->ext);
-
-                                if (overlap) {
-                                        dup_end_left = true;
-                                        continue;
-                                }
-                        } else if (nft_set_elem_active(&rbe->ext, genmask) &&
-                                   !nft_set_elem_expired(&rbe->ext)) {
-                                overlap = nft_rbtree_interval_end(rbe);
-                        }
+                        p = &parent->rb_right;
                 } else {
-                        if (nft_rbtree_interval_end(rbe) &&
-                            nft_rbtree_interval_start(new)) {
+                        if (nft_rbtree_interval_end(rbe))
                                 p = &parent->rb_left;
-
-                                if (nft_set_elem_active(&rbe->ext, genmask) &&
-                                    !nft_set_elem_expired(&rbe->ext))
-                                        overlap = false;
-                        } else if (nft_rbtree_interval_start(rbe) &&
-                                   nft_rbtree_interval_end(new)) {
+                        else
                                 p = &parent->rb_right;
+                }
+        }
+
+        if (!first)
+                first = rb_first(&priv->root);
+
+        /* Detect overlap by going through the list of valid tree nodes.
+         * Values stored in the tree are in reversed order, starting from
+         * highest to lowest value.
+         */
+        for (node = first; node != NULL; node = rb_next(node)) {
+                rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+                if (!nft_set_elem_active(&rbe->ext, genmask))
+                        continue;
 
-                                if (nft_set_elem_active(&rbe->ext, genmask) &&
-                                    !nft_set_elem_expired(&rbe->ext))
-                                        overlap = false;
-                        } else if (nft_set_elem_active(&rbe->ext, genmask) &&
-                                   !nft_set_elem_expired(&rbe->ext)) {
-                                *ext = &rbe->ext;
-                                return -EEXIST;
-                        } else {
-                                overlap = false;
-                                if (nft_rbtree_interval_end(rbe))
-                                        p = &parent->rb_left;
-                                else
-                                        p = &parent->rb_right;
+                /* perform garbage collection to avoid bogus overlap reports. */
+                if (nft_set_elem_expired(&rbe->ext)) {
+                        err = nft_rbtree_gc_elem(set, priv, rbe);
+                        if (err < 0)
+                                return err;
+
+                        continue;
+                }
+
+                d = nft_rbtree_cmp(set, rbe, new);
+                if (d == 0) {
+                        /* Matching end element: no need to look for an
+                         * overlapping greater or equal element.
+                         */
+                        if (nft_rbtree_interval_end(rbe)) {
+                                rbe_le = rbe;
+                                break;
+                        }
+
+                        /* first element that is greater or equal to key value. */
+                        if (!rbe_ge) {
+                                rbe_ge = rbe;
+                                continue;
+                        }
+
+                        /* this is a closer more or equal element, update it. */
+                        if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
+                                rbe_ge = rbe;
+                                continue;
                         }
+
+                        /* element is equal to key value, make sure flags are
+                         * the same, an existing more or equal start element
+                         * must not be replaced by more or equal end element.
+                         */
+                        if ((nft_rbtree_interval_start(new) &&
+                             nft_rbtree_interval_start(rbe_ge)) ||
+                            (nft_rbtree_interval_end(new) &&
+                             nft_rbtree_interval_end(rbe_ge))) {
+                                rbe_ge = rbe;
+                                continue;
+                        }
+                } else if (d > 0) {
+                        /* annotate element greater than the new element. */
+                        rbe_ge = rbe;
+                        continue;
+                } else if (d < 0) {
+                        /* annotate element less than the new element. */
+                        rbe_le = rbe;
+                        break;
                 }
+        }
 
-                dup_end_left = dup_end_right = false;
+        /* - new start element matching existing start element: full overlap
+         *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+         */
+        if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
+            nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
+                *ext = &rbe_ge->ext;
+                return -EEXIST;
+        }
+
+        /* - new end element matching existing end element: full overlap
+         *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+         */
+        if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
+            nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+                *ext = &rbe_le->ext;
+                return -EEXIST;
         }
 
-        if (overlap)
+        /* - new start element with existing closest, less or equal key value
+         *   being a start element: partial overlap, reported as -ENOTEMPTY.
+         *   Anonymous sets allow for two consecutive start element since they
+         *   are constant, skip them to avoid bogus overlap reports.
+         */
+        if (!nft_set_is_anonymous(set) && rbe_le &&
+            nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
+                return -ENOTEMPTY;
+
+        /* - new end element with existing closest, less or equal key value
+         *   being a end element: partial overlap, reported as -ENOTEMPTY.
+         */
+        if (rbe_le &&
+            nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
                 return -ENOTEMPTY;
 
+        /* - new end element with existing closest, greater or equal key value
+         *   being an end element: partial overlap, reported as -ENOTEMPTY
+         */
+        if (rbe_ge &&
+            nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
+                return -ENOTEMPTY;
+
+        /* Accepted element: pick insertion point depending on key value */
+        parent = NULL;
+        p = &priv->root.rb_node;
+        while (*p != NULL) {
+                parent = *p;
+                rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+                d = nft_rbtree_cmp(set, rbe, new);
+
+                if (d < 0)
+                        p = &parent->rb_left;
+                else if (d > 0)
+                        p = &parent->rb_right;
+                else if (nft_rbtree_interval_end(rbe))
+                        p = &parent->rb_left;
+                else
+                        p = &parent->rb_right;
+        }
+
         rb_link_node_rcu(&new->node, parent, p);
         rb_insert_color(&new->node, &priv->root);
         return 0;
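
For readers who want to play with the decision rules the patch applies after the ordered node walk, here is a minimal user-space sketch. The struct and helper names (elem, cmp, classify_insert) are invented for illustration and are not part of the kernel code; only the classification logic mirrors the checks performed after the walk in __nft_rbtree_insert(), with the anonymous-set exception and the expired-element garbage collection deliberately left out.

/* overlap_sketch.c -- build and run with: cc -Wall overlap_sketch.c && ./a.out
 * Hypothetical user-space illustration of the overlap classification applied
 * once the closest less-or-equal and greater-or-equal boundaries are known.
 */
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct elem {
        unsigned int key;       /* stands in for the set element key */
        bool end;               /* true for an interval-end boundary */
};

/* plays the role of nft_rbtree_cmp(): negative, zero or positive on key order */
static int cmp(const struct elem *a, const struct elem *b)
{
        return (a->key > b->key) - (a->key < b->key);
}

/* Classify an insertion given the closest boundaries found by the walk:
 * le = closest element with key <= new, ge = closest element with key >= new
 * (either may be NULL). Returns 0, -EEXIST or -ENOTEMPTY, mirroring the
 * post-walk checks; the anonymous-set exception for consecutive starts and
 * expired-element GC are omitted here.
 */
static int classify_insert(const struct elem *new,
                           const struct elem *le, const struct elem *ge)
{
        /* start matching an existing start, or end matching an existing end:
         * full overlap, reported as -EEXIST (the caller may clear it).
         */
        if (ge && cmp(new, ge) == 0 && new->end == ge->end)
                return -EEXIST;
        if (le && cmp(new, le) == 0 && new->end == le->end)
                return -EEXIST;

        /* new start after an unclosed start, new end after an end, or new end
         * running into a following end: partial overlap, -ENOTEMPTY.
         */
        if (le && !le->end && !new->end)
                return -ENOTEMPTY;
        if (le && le->end && new->end)
                return -ENOTEMPTY;
        if (ge && ge->end && new->end)
                return -ENOTEMPTY;

        return 0;
}

int main(void)
{
        struct elem start10 = { .key = 10, .end = false };
        struct elem end20   = { .key = 20, .end = true  };
        struct elem start15 = { .key = 15, .end = false };

        /* inserting a start inside the existing [10, 20) interval */
        printf("insert start 15: %d (expect -ENOTEMPTY)\n",
               classify_insert(&start15, &start10, &end20));

        /* re-inserting the existing start element */
        printf("insert start 10: %d (expect -EEXIST)\n",
               classify_insert(&start10, NULL, &start10));
        return 0;
}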